Convert nltk code to version 3.4 96/245996/1
author  jay.ho.park <jay.ho.park@samsung.com>
Thu, 22 Oct 2020 04:56:46 +0000 (13:56 +0900)
committer  jay.ho.park <jay.ho.park@samsung.com>
Thu, 22 Oct 2020 05:02:24 +0000 (14:02 +0900)
Change-Id: I8fbc3822305a67ea3aed093100f7f4711b96d3db

737 files changed:
nlp_resource_data/nltk/VERSION
nlp_resource_data/nltk/__init__.py
nlp_resource_data/nltk/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/book.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/cli.cpython-37.pyc [deleted file]
nlp_resource_data/nltk/__pycache__/collections.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/collocations.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/compat.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/data.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/decorators.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/downloader.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/featstruct.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/grammar.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/help.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/internals.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/jsontags.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/lazyimport.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/probability.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/text.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/tgrep.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/toolbox.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/tree.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/treeprettyprinter.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/treetransforms.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/__pycache__/wsd.cpython-37.pyc
nlp_resource_data/nltk/app/__init__.py
nlp_resource_data/nltk/app/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/app/__pycache__/chartparser_app.cpython-37.pyc
nlp_resource_data/nltk/app/__pycache__/chunkparser_app.cpython-37.pyc
nlp_resource_data/nltk/app/__pycache__/collocations_app.cpython-37.pyc
nlp_resource_data/nltk/app/__pycache__/concordance_app.cpython-37.pyc
nlp_resource_data/nltk/app/__pycache__/nemo_app.cpython-37.pyc
nlp_resource_data/nltk/app/__pycache__/rdparser_app.cpython-37.pyc
nlp_resource_data/nltk/app/__pycache__/srparser_app.cpython-37.pyc
nlp_resource_data/nltk/app/__pycache__/wordfreq_app.cpython-37.pyc
nlp_resource_data/nltk/app/__pycache__/wordnet_app.cpython-37.pyc
nlp_resource_data/nltk/app/chartparser_app.py
nlp_resource_data/nltk/app/chunkparser_app.py
nlp_resource_data/nltk/app/collocations_app.py
nlp_resource_data/nltk/app/concordance_app.py
nlp_resource_data/nltk/app/nemo_app.py
nlp_resource_data/nltk/app/rdparser_app.py
nlp_resource_data/nltk/app/srparser_app.py
nlp_resource_data/nltk/app/wordfreq_app.py
nlp_resource_data/nltk/app/wordnet_app.py
nlp_resource_data/nltk/book.py
nlp_resource_data/nltk/ccg/__init__.py
nlp_resource_data/nltk/ccg/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/ccg/__pycache__/api.cpython-37.pyc
nlp_resource_data/nltk/ccg/__pycache__/chart.cpython-37.pyc
nlp_resource_data/nltk/ccg/__pycache__/combinator.cpython-37.pyc
nlp_resource_data/nltk/ccg/__pycache__/lexicon.cpython-37.pyc
nlp_resource_data/nltk/ccg/__pycache__/logic.cpython-37.pyc
nlp_resource_data/nltk/ccg/api.py
nlp_resource_data/nltk/ccg/chart.py
nlp_resource_data/nltk/ccg/combinator.py
nlp_resource_data/nltk/ccg/lexicon.py
nlp_resource_data/nltk/ccg/logic.py
nlp_resource_data/nltk/chat/__init__.py
nlp_resource_data/nltk/chat/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/chat/__pycache__/eliza.cpython-37.pyc
nlp_resource_data/nltk/chat/__pycache__/iesha.cpython-37.pyc
nlp_resource_data/nltk/chat/__pycache__/rude.cpython-37.pyc
nlp_resource_data/nltk/chat/__pycache__/suntsu.cpython-37.pyc
nlp_resource_data/nltk/chat/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/chat/__pycache__/zen.cpython-37.pyc
nlp_resource_data/nltk/chat/eliza.py
nlp_resource_data/nltk/chat/iesha.py
nlp_resource_data/nltk/chat/rude.py
nlp_resource_data/nltk/chat/suntsu.py
nlp_resource_data/nltk/chat/util.py
nlp_resource_data/nltk/chat/zen.py
nlp_resource_data/nltk/chunk/__init__.py
nlp_resource_data/nltk/chunk/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/chunk/__pycache__/api.cpython-37.pyc
nlp_resource_data/nltk/chunk/__pycache__/named_entity.cpython-37.pyc
nlp_resource_data/nltk/chunk/__pycache__/regexp.cpython-37.pyc
nlp_resource_data/nltk/chunk/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/chunk/api.py
nlp_resource_data/nltk/chunk/named_entity.py
nlp_resource_data/nltk/chunk/regexp.py
nlp_resource_data/nltk/chunk/util.py
nlp_resource_data/nltk/classify/__init__.py
nlp_resource_data/nltk/classify/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/classify/__pycache__/api.cpython-37.pyc
nlp_resource_data/nltk/classify/__pycache__/decisiontree.cpython-37.pyc
nlp_resource_data/nltk/classify/__pycache__/maxent.cpython-37.pyc
nlp_resource_data/nltk/classify/__pycache__/megam.cpython-37.pyc
nlp_resource_data/nltk/classify/__pycache__/naivebayes.cpython-37.pyc
nlp_resource_data/nltk/classify/__pycache__/positivenaivebayes.cpython-37.pyc
nlp_resource_data/nltk/classify/__pycache__/rte_classify.cpython-37.pyc
nlp_resource_data/nltk/classify/__pycache__/scikitlearn.cpython-37.pyc
nlp_resource_data/nltk/classify/__pycache__/senna.cpython-37.pyc
nlp_resource_data/nltk/classify/__pycache__/svm.cpython-37.pyc
nlp_resource_data/nltk/classify/__pycache__/tadm.cpython-37.pyc
nlp_resource_data/nltk/classify/__pycache__/textcat.cpython-37.pyc
nlp_resource_data/nltk/classify/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/classify/__pycache__/weka.cpython-37.pyc
nlp_resource_data/nltk/classify/api.py
nlp_resource_data/nltk/classify/decisiontree.py
nlp_resource_data/nltk/classify/maxent.py
nlp_resource_data/nltk/classify/megam.py
nlp_resource_data/nltk/classify/naivebayes.py
nlp_resource_data/nltk/classify/positivenaivebayes.py
nlp_resource_data/nltk/classify/rte_classify.py
nlp_resource_data/nltk/classify/scikitlearn.py
nlp_resource_data/nltk/classify/senna.py
nlp_resource_data/nltk/classify/svm.py
nlp_resource_data/nltk/classify/tadm.py
nlp_resource_data/nltk/classify/textcat.py
nlp_resource_data/nltk/classify/util.py
nlp_resource_data/nltk/classify/weka.py
nlp_resource_data/nltk/cli.py [deleted file]
nlp_resource_data/nltk/cluster/__init__.py
nlp_resource_data/nltk/cluster/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/cluster/__pycache__/api.cpython-37.pyc
nlp_resource_data/nltk/cluster/__pycache__/em.cpython-37.pyc
nlp_resource_data/nltk/cluster/__pycache__/gaac.cpython-37.pyc
nlp_resource_data/nltk/cluster/__pycache__/kmeans.cpython-37.pyc
nlp_resource_data/nltk/cluster/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/cluster/api.py
nlp_resource_data/nltk/cluster/em.py
nlp_resource_data/nltk/cluster/gaac.py
nlp_resource_data/nltk/cluster/kmeans.py
nlp_resource_data/nltk/cluster/util.py
nlp_resource_data/nltk/collections.py
nlp_resource_data/nltk/collocations.py
nlp_resource_data/nltk/compat.py
nlp_resource_data/nltk/corpus/__init__.py
nlp_resource_data/nltk/corpus/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/corpus/__pycache__/europarl_raw.cpython-37.pyc
nlp_resource_data/nltk/corpus/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/corpus/europarl_raw.py
nlp_resource_data/nltk/corpus/reader/__init__.py
nlp_resource_data/nltk/corpus/reader/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/aligned.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/api.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/bnc.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/bracket_parse.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/categorized_sents.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/chasen.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/childes.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/chunked.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/cmudict.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/comparative_sents.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/conll.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/crubadan.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/dependency.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/framenet.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/ieer.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/indian.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/ipipan.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/knbc.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/lin.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/mte.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/nkjp.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/nombank.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/nps_chat.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/opinion_lexicon.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/panlex_lite.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/panlex_swadesh.cpython-37.pyc [deleted file]
nlp_resource_data/nltk/corpus/reader/__pycache__/pl196x.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/plaintext.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/ppattach.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/propbank.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/pros_cons.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/reviews.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/rte.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/semcor.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/senseval.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/sentiwordnet.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/sinica_treebank.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/string_category.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/switchboard.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/tagged.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/timit.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/toolbox.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/twitter.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/udhr.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/verbnet.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/wordlist.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/wordnet.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/xmldocs.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/__pycache__/ycoe.cpython-37.pyc
nlp_resource_data/nltk/corpus/reader/aligned.py
nlp_resource_data/nltk/corpus/reader/api.py
nlp_resource_data/nltk/corpus/reader/bnc.py
nlp_resource_data/nltk/corpus/reader/bracket_parse.py
nlp_resource_data/nltk/corpus/reader/categorized_sents.py
nlp_resource_data/nltk/corpus/reader/chasen.py
nlp_resource_data/nltk/corpus/reader/childes.py
nlp_resource_data/nltk/corpus/reader/chunked.py
nlp_resource_data/nltk/corpus/reader/cmudict.py
nlp_resource_data/nltk/corpus/reader/comparative_sents.py
nlp_resource_data/nltk/corpus/reader/conll.py
nlp_resource_data/nltk/corpus/reader/crubadan.py
nlp_resource_data/nltk/corpus/reader/dependency.py
nlp_resource_data/nltk/corpus/reader/framenet.py
nlp_resource_data/nltk/corpus/reader/ieer.py
nlp_resource_data/nltk/corpus/reader/indian.py
nlp_resource_data/nltk/corpus/reader/ipipan.py
nlp_resource_data/nltk/corpus/reader/knbc.py
nlp_resource_data/nltk/corpus/reader/lin.py
nlp_resource_data/nltk/corpus/reader/mte.py
nlp_resource_data/nltk/corpus/reader/nkjp.py
nlp_resource_data/nltk/corpus/reader/nombank.py
nlp_resource_data/nltk/corpus/reader/nps_chat.py
nlp_resource_data/nltk/corpus/reader/opinion_lexicon.py
nlp_resource_data/nltk/corpus/reader/panlex_lite.py
nlp_resource_data/nltk/corpus/reader/panlex_swadesh.py [deleted file]
nlp_resource_data/nltk/corpus/reader/pl196x.py
nlp_resource_data/nltk/corpus/reader/plaintext.py
nlp_resource_data/nltk/corpus/reader/ppattach.py
nlp_resource_data/nltk/corpus/reader/propbank.py
nlp_resource_data/nltk/corpus/reader/pros_cons.py
nlp_resource_data/nltk/corpus/reader/reviews.py
nlp_resource_data/nltk/corpus/reader/rte.py
nlp_resource_data/nltk/corpus/reader/semcor.py
nlp_resource_data/nltk/corpus/reader/senseval.py
nlp_resource_data/nltk/corpus/reader/sentiwordnet.py
nlp_resource_data/nltk/corpus/reader/sinica_treebank.py
nlp_resource_data/nltk/corpus/reader/string_category.py
nlp_resource_data/nltk/corpus/reader/switchboard.py
nlp_resource_data/nltk/corpus/reader/tagged.py
nlp_resource_data/nltk/corpus/reader/timit.py
nlp_resource_data/nltk/corpus/reader/toolbox.py
nlp_resource_data/nltk/corpus/reader/twitter.py
nlp_resource_data/nltk/corpus/reader/udhr.py
nlp_resource_data/nltk/corpus/reader/util.py
nlp_resource_data/nltk/corpus/reader/verbnet.py
nlp_resource_data/nltk/corpus/reader/wordlist.py
nlp_resource_data/nltk/corpus/reader/wordnet.py
nlp_resource_data/nltk/corpus/reader/xmldocs.py
nlp_resource_data/nltk/corpus/reader/ycoe.py
nlp_resource_data/nltk/corpus/util.py
nlp_resource_data/nltk/data.py
nlp_resource_data/nltk/decorators.py
nlp_resource_data/nltk/downloader.py
nlp_resource_data/nltk/draw/__init__.py
nlp_resource_data/nltk/draw/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/draw/__pycache__/cfg.cpython-37.pyc
nlp_resource_data/nltk/draw/__pycache__/dispersion.cpython-37.pyc
nlp_resource_data/nltk/draw/__pycache__/table.cpython-37.pyc
nlp_resource_data/nltk/draw/__pycache__/tree.cpython-37.pyc
nlp_resource_data/nltk/draw/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/draw/cfg.py
nlp_resource_data/nltk/draw/dispersion.py
nlp_resource_data/nltk/draw/table.py
nlp_resource_data/nltk/draw/tree.py
nlp_resource_data/nltk/draw/util.py
nlp_resource_data/nltk/featstruct.py
nlp_resource_data/nltk/grammar.py
nlp_resource_data/nltk/help.py
nlp_resource_data/nltk/inference/__init__.py
nlp_resource_data/nltk/inference/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/inference/__pycache__/api.cpython-37.pyc
nlp_resource_data/nltk/inference/__pycache__/discourse.cpython-37.pyc
nlp_resource_data/nltk/inference/__pycache__/mace.cpython-37.pyc
nlp_resource_data/nltk/inference/__pycache__/nonmonotonic.cpython-37.pyc
nlp_resource_data/nltk/inference/__pycache__/prover9.cpython-37.pyc
nlp_resource_data/nltk/inference/__pycache__/resolution.cpython-37.pyc
nlp_resource_data/nltk/inference/__pycache__/tableau.cpython-37.pyc
nlp_resource_data/nltk/inference/api.py
nlp_resource_data/nltk/inference/discourse.py
nlp_resource_data/nltk/inference/mace.py
nlp_resource_data/nltk/inference/nonmonotonic.py
nlp_resource_data/nltk/inference/prover9.py
nlp_resource_data/nltk/inference/resolution.py
nlp_resource_data/nltk/inference/tableau.py
nlp_resource_data/nltk/internals.py
nlp_resource_data/nltk/jsontags.py
nlp_resource_data/nltk/lazyimport.py
nlp_resource_data/nltk/lm/__init__.py
nlp_resource_data/nltk/lm/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/lm/__pycache__/api.cpython-37.pyc
nlp_resource_data/nltk/lm/__pycache__/counter.cpython-37.pyc
nlp_resource_data/nltk/lm/__pycache__/models.cpython-37.pyc
nlp_resource_data/nltk/lm/__pycache__/preprocessing.cpython-37.pyc
nlp_resource_data/nltk/lm/__pycache__/smoothing.cpython-37.pyc
nlp_resource_data/nltk/lm/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/lm/__pycache__/vocabulary.cpython-37.pyc
nlp_resource_data/nltk/lm/api.py
nlp_resource_data/nltk/lm/counter.py
nlp_resource_data/nltk/lm/models.py
nlp_resource_data/nltk/lm/preprocessing.py
nlp_resource_data/nltk/lm/smoothing.py
nlp_resource_data/nltk/lm/util.py
nlp_resource_data/nltk/lm/vocabulary.py
nlp_resource_data/nltk/metrics/__init__.py
nlp_resource_data/nltk/metrics/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/metrics/__pycache__/agreement.cpython-37.pyc
nlp_resource_data/nltk/metrics/__pycache__/aline.cpython-37.pyc
nlp_resource_data/nltk/metrics/__pycache__/association.cpython-37.pyc
nlp_resource_data/nltk/metrics/__pycache__/confusionmatrix.cpython-37.pyc
nlp_resource_data/nltk/metrics/__pycache__/distance.cpython-37.pyc
nlp_resource_data/nltk/metrics/__pycache__/paice.cpython-37.pyc
nlp_resource_data/nltk/metrics/__pycache__/scores.cpython-37.pyc
nlp_resource_data/nltk/metrics/__pycache__/segmentation.cpython-37.pyc
nlp_resource_data/nltk/metrics/__pycache__/spearman.cpython-37.pyc
nlp_resource_data/nltk/metrics/agreement.py
nlp_resource_data/nltk/metrics/aline.py
nlp_resource_data/nltk/metrics/association.py
nlp_resource_data/nltk/metrics/confusionmatrix.py
nlp_resource_data/nltk/metrics/distance.py
nlp_resource_data/nltk/metrics/paice.py
nlp_resource_data/nltk/metrics/scores.py
nlp_resource_data/nltk/metrics/segmentation.py
nlp_resource_data/nltk/metrics/spearman.py
nlp_resource_data/nltk/misc/__init__.py
nlp_resource_data/nltk/misc/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/misc/__pycache__/babelfish.cpython-37.pyc
nlp_resource_data/nltk/misc/__pycache__/chomsky.cpython-37.pyc
nlp_resource_data/nltk/misc/__pycache__/minimalset.cpython-37.pyc
nlp_resource_data/nltk/misc/__pycache__/sort.cpython-37.pyc
nlp_resource_data/nltk/misc/__pycache__/wordfinder.cpython-37.pyc
nlp_resource_data/nltk/misc/babelfish.py
nlp_resource_data/nltk/misc/chomsky.py
nlp_resource_data/nltk/misc/minimalset.py
nlp_resource_data/nltk/misc/sort.py
nlp_resource_data/nltk/misc/wordfinder.py
nlp_resource_data/nltk/parse/__init__.py
nlp_resource_data/nltk/parse/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/api.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/bllip.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/chart.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/corenlp.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/dependencygraph.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/earleychart.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/evaluate.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/featurechart.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/generate.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/malt.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/nonprojectivedependencyparser.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/pchart.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/projectivedependencyparser.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/recursivedescent.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/shiftreduce.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/stanford.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/transitionparser.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/parse/__pycache__/viterbi.cpython-37.pyc
nlp_resource_data/nltk/parse/api.py
nlp_resource_data/nltk/parse/bllip.py
nlp_resource_data/nltk/parse/chart.py
nlp_resource_data/nltk/parse/corenlp.py
nlp_resource_data/nltk/parse/dependencygraph.py
nlp_resource_data/nltk/parse/earleychart.py
nlp_resource_data/nltk/parse/evaluate.py
nlp_resource_data/nltk/parse/featurechart.py
nlp_resource_data/nltk/parse/generate.py
nlp_resource_data/nltk/parse/malt.py
nlp_resource_data/nltk/parse/nonprojectivedependencyparser.py
nlp_resource_data/nltk/parse/pchart.py
nlp_resource_data/nltk/parse/projectivedependencyparser.py
nlp_resource_data/nltk/parse/recursivedescent.py
nlp_resource_data/nltk/parse/shiftreduce.py
nlp_resource_data/nltk/parse/stanford.py
nlp_resource_data/nltk/parse/transitionparser.py
nlp_resource_data/nltk/parse/util.py
nlp_resource_data/nltk/parse/viterbi.py
nlp_resource_data/nltk/probability.py
nlp_resource_data/nltk/sem/__init__.py
nlp_resource_data/nltk/sem/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/sem/__pycache__/boxer.cpython-37.pyc
nlp_resource_data/nltk/sem/__pycache__/chat80.cpython-37.pyc
nlp_resource_data/nltk/sem/__pycache__/cooper_storage.cpython-37.pyc
nlp_resource_data/nltk/sem/__pycache__/drt.cpython-37.pyc
nlp_resource_data/nltk/sem/__pycache__/drt_glue_demo.cpython-37.pyc
nlp_resource_data/nltk/sem/__pycache__/evaluate.cpython-37.pyc
nlp_resource_data/nltk/sem/__pycache__/glue.cpython-37.pyc
nlp_resource_data/nltk/sem/__pycache__/hole.cpython-37.pyc
nlp_resource_data/nltk/sem/__pycache__/lfg.cpython-37.pyc
nlp_resource_data/nltk/sem/__pycache__/linearlogic.cpython-37.pyc
nlp_resource_data/nltk/sem/__pycache__/logic.cpython-37.pyc
nlp_resource_data/nltk/sem/__pycache__/relextract.cpython-37.pyc
nlp_resource_data/nltk/sem/__pycache__/skolemize.cpython-37.pyc
nlp_resource_data/nltk/sem/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/sem/boxer.py
nlp_resource_data/nltk/sem/chat80.py
nlp_resource_data/nltk/sem/cooper_storage.py
nlp_resource_data/nltk/sem/drt.py
nlp_resource_data/nltk/sem/drt_glue_demo.py
nlp_resource_data/nltk/sem/evaluate.py
nlp_resource_data/nltk/sem/glue.py
nlp_resource_data/nltk/sem/hole.py
nlp_resource_data/nltk/sem/lfg.py
nlp_resource_data/nltk/sem/linearlogic.py
nlp_resource_data/nltk/sem/logic.py
nlp_resource_data/nltk/sem/relextract.py
nlp_resource_data/nltk/sem/skolemize.py
nlp_resource_data/nltk/sem/util.py
nlp_resource_data/nltk/sentiment/__init__.py
nlp_resource_data/nltk/sentiment/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/sentiment/__pycache__/sentiment_analyzer.cpython-37.pyc
nlp_resource_data/nltk/sentiment/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/sentiment/__pycache__/vader.cpython-37.pyc
nlp_resource_data/nltk/sentiment/sentiment_analyzer.py
nlp_resource_data/nltk/sentiment/util.py
nlp_resource_data/nltk/sentiment/vader.py
nlp_resource_data/nltk/stem/__init__.py
nlp_resource_data/nltk/stem/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/stem/__pycache__/api.cpython-37.pyc
nlp_resource_data/nltk/stem/__pycache__/arlstem.cpython-37.pyc
nlp_resource_data/nltk/stem/__pycache__/cistem.cpython-37.pyc
nlp_resource_data/nltk/stem/__pycache__/isri.cpython-37.pyc
nlp_resource_data/nltk/stem/__pycache__/lancaster.cpython-37.pyc
nlp_resource_data/nltk/stem/__pycache__/porter.cpython-37.pyc
nlp_resource_data/nltk/stem/__pycache__/regexp.cpython-37.pyc
nlp_resource_data/nltk/stem/__pycache__/rslp.cpython-37.pyc
nlp_resource_data/nltk/stem/__pycache__/snowball.cpython-37.pyc
nlp_resource_data/nltk/stem/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/stem/__pycache__/wordnet.cpython-37.pyc
nlp_resource_data/nltk/stem/api.py
nlp_resource_data/nltk/stem/arlstem.py
nlp_resource_data/nltk/stem/cistem.py
nlp_resource_data/nltk/stem/isri.py
nlp_resource_data/nltk/stem/lancaster.py
nlp_resource_data/nltk/stem/porter.py
nlp_resource_data/nltk/stem/regexp.py
nlp_resource_data/nltk/stem/rslp.py
nlp_resource_data/nltk/stem/snowball.py
nlp_resource_data/nltk/stem/util.py
nlp_resource_data/nltk/stem/wordnet.py
nlp_resource_data/nltk/tag/__init__.py
nlp_resource_data/nltk/tag/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/tag/__pycache__/api.cpython-37.pyc
nlp_resource_data/nltk/tag/__pycache__/brill.cpython-37.pyc
nlp_resource_data/nltk/tag/__pycache__/brill_trainer.cpython-37.pyc
nlp_resource_data/nltk/tag/__pycache__/crf.cpython-37.pyc
nlp_resource_data/nltk/tag/__pycache__/hmm.cpython-37.pyc
nlp_resource_data/nltk/tag/__pycache__/hunpos.cpython-37.pyc
nlp_resource_data/nltk/tag/__pycache__/mapping.cpython-37.pyc
nlp_resource_data/nltk/tag/__pycache__/perceptron.cpython-37.pyc
nlp_resource_data/nltk/tag/__pycache__/senna.cpython-37.pyc
nlp_resource_data/nltk/tag/__pycache__/sequential.cpython-37.pyc
nlp_resource_data/nltk/tag/__pycache__/stanford.cpython-37.pyc
nlp_resource_data/nltk/tag/__pycache__/tnt.cpython-37.pyc
nlp_resource_data/nltk/tag/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/tag/api.py
nlp_resource_data/nltk/tag/brill.py
nlp_resource_data/nltk/tag/brill_trainer.py
nlp_resource_data/nltk/tag/crf.py
nlp_resource_data/nltk/tag/hmm.py
nlp_resource_data/nltk/tag/hunpos.py
nlp_resource_data/nltk/tag/mapping.py
nlp_resource_data/nltk/tag/perceptron.py
nlp_resource_data/nltk/tag/senna.py
nlp_resource_data/nltk/tag/sequential.py
nlp_resource_data/nltk/tag/stanford.py
nlp_resource_data/nltk/tag/tnt.py
nlp_resource_data/nltk/tag/util.py
nlp_resource_data/nltk/tbl/__init__.py
nlp_resource_data/nltk/tbl/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/tbl/__pycache__/api.cpython-37.pyc
nlp_resource_data/nltk/tbl/__pycache__/demo.cpython-37.pyc
nlp_resource_data/nltk/tbl/__pycache__/erroranalysis.cpython-37.pyc
nlp_resource_data/nltk/tbl/__pycache__/feature.cpython-37.pyc
nlp_resource_data/nltk/tbl/__pycache__/rule.cpython-37.pyc
nlp_resource_data/nltk/tbl/__pycache__/template.cpython-37.pyc
nlp_resource_data/nltk/tbl/demo.py
nlp_resource_data/nltk/tbl/erroranalysis.py
nlp_resource_data/nltk/tbl/feature.py
nlp_resource_data/nltk/tbl/rule.py
nlp_resource_data/nltk/tbl/template.py
nlp_resource_data/nltk/test/__init__.py
nlp_resource_data/nltk/test/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/all.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/childes_fixt.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/classify_fixt.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/compat_fixt.cpython-37.pyc [new file with mode: 0644]
nlp_resource_data/nltk/test/__pycache__/corpus_fixt.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/discourse_fixt.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/doctest_nose_plugin.cpython-37.pyc [new file with mode: 0644]
nlp_resource_data/nltk/test/__pycache__/gensim_fixt.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/gluesemantics_malt_fixt.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/inference_fixt.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/nonmonotonic_fixt.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/portuguese_en_fixt.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/probability_fixt.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/runtests.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/segmentation_fixt.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/semantics_fixt.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/translate_fixt.cpython-37.pyc
nlp_resource_data/nltk/test/__pycache__/wordnet_fixt.cpython-37.pyc
nlp_resource_data/nltk/test/all.py
nlp_resource_data/nltk/test/bnc.doctest
nlp_resource_data/nltk/test/ccg.doctest
nlp_resource_data/nltk/test/ccg_semantics.doctest
nlp_resource_data/nltk/test/chat80.doctest
nlp_resource_data/nltk/test/childes_fixt.py
nlp_resource_data/nltk/test/chunk.doctest
nlp_resource_data/nltk/test/classify.doctest
nlp_resource_data/nltk/test/classify_fixt.py
nlp_resource_data/nltk/test/collections.doctest
nlp_resource_data/nltk/test/collocations.doctest
nlp_resource_data/nltk/test/compat.doctest [new file with mode: 0644]
nlp_resource_data/nltk/test/compat_fixt.py [new file with mode: 0644]
nlp_resource_data/nltk/test/corpus.doctest
nlp_resource_data/nltk/test/corpus_fixt.py
nlp_resource_data/nltk/test/crubadan.doctest
nlp_resource_data/nltk/test/data.doctest
nlp_resource_data/nltk/test/dependency.doctest
nlp_resource_data/nltk/test/discourse.doctest
nlp_resource_data/nltk/test/discourse_fixt.py
nlp_resource_data/nltk/test/doctest_nose_plugin.py [new file with mode: 0644]
nlp_resource_data/nltk/test/drt.doctest
nlp_resource_data/nltk/test/featgram.doctest
nlp_resource_data/nltk/test/featstruct.doctest
nlp_resource_data/nltk/test/framenet.doctest
nlp_resource_data/nltk/test/generate.doctest
nlp_resource_data/nltk/test/gensim.doctest
nlp_resource_data/nltk/test/gensim_fixt.py
nlp_resource_data/nltk/test/gluesemantics.doctest
nlp_resource_data/nltk/test/gluesemantics_malt.doctest
nlp_resource_data/nltk/test/gluesemantics_malt_fixt.py
nlp_resource_data/nltk/test/grammar.doctest
nlp_resource_data/nltk/test/grammartestsuites.doctest
nlp_resource_data/nltk/test/index.doctest
nlp_resource_data/nltk/test/inference.doctest
nlp_resource_data/nltk/test/inference_fixt.py
nlp_resource_data/nltk/test/internals.doctest
nlp_resource_data/nltk/test/japanese.doctest
nlp_resource_data/nltk/test/lm.doctest
nlp_resource_data/nltk/test/logic.doctest
nlp_resource_data/nltk/test/meteor.doctest [deleted file]
nlp_resource_data/nltk/test/metrics.doctest
nlp_resource_data/nltk/test/misc.doctest
nlp_resource_data/nltk/test/nonmonotonic.doctest
nlp_resource_data/nltk/test/nonmonotonic_fixt.py
nlp_resource_data/nltk/test/parse.doctest
nlp_resource_data/nltk/test/portuguese_en.doctest
nlp_resource_data/nltk/test/portuguese_en_fixt.py
nlp_resource_data/nltk/test/probability.doctest
nlp_resource_data/nltk/test/probability_fixt.py
nlp_resource_data/nltk/test/propbank.doctest
nlp_resource_data/nltk/test/relextract.doctest
nlp_resource_data/nltk/test/resolution.doctest
nlp_resource_data/nltk/test/runtests.py
nlp_resource_data/nltk/test/segmentation_fixt.py
nlp_resource_data/nltk/test/semantics.doctest
nlp_resource_data/nltk/test/semantics_fixt.py
nlp_resource_data/nltk/test/sentiment.doctest
nlp_resource_data/nltk/test/sentiwordnet.doctest
nlp_resource_data/nltk/test/simple.doctest
nlp_resource_data/nltk/test/stem.doctest
nlp_resource_data/nltk/test/tag.doctest
nlp_resource_data/nltk/test/tokenize.doctest
nlp_resource_data/nltk/test/toolbox.doctest
nlp_resource_data/nltk/test/translate.doctest
nlp_resource_data/nltk/test/translate_fixt.py
nlp_resource_data/nltk/test/tree.doctest
nlp_resource_data/nltk/test/treeprettyprinter.doctest
nlp_resource_data/nltk/test/treetransforms.doctest
nlp_resource_data/nltk/test/unit/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_2x_compat.cpython-37.pyc [new file with mode: 0644]
nlp_resource_data/nltk/test/unit/__pycache__/test_aline.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_brill.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_cfd_mutation.cpython-37.pyc [deleted file]
nlp_resource_data/nltk/test/unit/__pycache__/test_cfg2chomsky.cpython-37.pyc [deleted file]
nlp_resource_data/nltk/test/unit/__pycache__/test_chunk.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_classify.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_collocations.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_concordance.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_corenlp.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_corpora.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_corpus_views.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_data.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_disagreement.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_freqdist.cpython-37.pyc [deleted file]
nlp_resource_data/nltk/test/unit/__pycache__/test_hmm.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_json2csv_corpus.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_json_serialization.cpython-37.pyc [deleted file]
nlp_resource_data/nltk/test/unit/__pycache__/test_naivebayes.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_nombank.cpython-37.pyc [deleted file]
nlp_resource_data/nltk/test/unit/__pycache__/test_pl196x.cpython-37.pyc [deleted file]
nlp_resource_data/nltk/test/unit/__pycache__/test_pos_tag.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_rte_classify.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_seekable_unicode_stream_reader.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_senna.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_stem.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_tag.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_tgrep.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_tokenize.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_twitter_auth.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/test_wordnet.cpython-37.pyc
nlp_resource_data/nltk/test/unit/__pycache__/utils.cpython-37.pyc
nlp_resource_data/nltk/test/unit/lm/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/test/unit/lm/__pycache__/test_counter.cpython-37.pyc
nlp_resource_data/nltk/test/unit/lm/__pycache__/test_models.cpython-37.pyc
nlp_resource_data/nltk/test/unit/lm/__pycache__/test_preprocessing.cpython-37.pyc
nlp_resource_data/nltk/test/unit/lm/__pycache__/test_vocabulary.cpython-37.pyc
nlp_resource_data/nltk/test/unit/lm/test_counter.py
nlp_resource_data/nltk/test/unit/lm/test_models.py
nlp_resource_data/nltk/test/unit/lm/test_preprocessing.py
nlp_resource_data/nltk/test/unit/lm/test_vocabulary.py
nlp_resource_data/nltk/test/unit/test_2x_compat.py [new file with mode: 0644]
nlp_resource_data/nltk/test/unit/test_aline.py
nlp_resource_data/nltk/test/unit/test_cfd_mutation.py [deleted file]
nlp_resource_data/nltk/test/unit/test_cfg2chomsky.py [deleted file]
nlp_resource_data/nltk/test/unit/test_chunk.py
nlp_resource_data/nltk/test/unit/test_classify.py
nlp_resource_data/nltk/test/unit/test_collocations.py
nlp_resource_data/nltk/test/unit/test_concordance.py
nlp_resource_data/nltk/test/unit/test_corenlp.py
nlp_resource_data/nltk/test/unit/test_corpora.py
nlp_resource_data/nltk/test/unit/test_corpus_views.py
nlp_resource_data/nltk/test/unit/test_disagreement.py
nlp_resource_data/nltk/test/unit/test_freqdist.py [deleted file]
nlp_resource_data/nltk/test/unit/test_hmm.py
nlp_resource_data/nltk/test/unit/test_json2csv_corpus.py
nlp_resource_data/nltk/test/unit/test_json_serialization.py [deleted file]
nlp_resource_data/nltk/test/unit/test_naivebayes.py
nlp_resource_data/nltk/test/unit/test_nombank.py [deleted file]
nlp_resource_data/nltk/test/unit/test_pl196x.py [deleted file]
nlp_resource_data/nltk/test/unit/test_pos_tag.py
nlp_resource_data/nltk/test/unit/test_rte_classify.py
nlp_resource_data/nltk/test/unit/test_seekable_unicode_stream_reader.py
nlp_resource_data/nltk/test/unit/test_senna.py
nlp_resource_data/nltk/test/unit/test_stem.py
nlp_resource_data/nltk/test/unit/test_tag.py
nlp_resource_data/nltk/test/unit/test_tgrep.py
nlp_resource_data/nltk/test/unit/test_tokenize.py
nlp_resource_data/nltk/test/unit/test_wordnet.py
nlp_resource_data/nltk/test/unit/translate/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/test/unit/translate/__pycache__/test_bleu.cpython-37.pyc
nlp_resource_data/nltk/test/unit/translate/__pycache__/test_gdfa.cpython-37.pyc
nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm1.cpython-37.pyc
nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm2.cpython-37.pyc
nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm3.cpython-37.pyc
nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm4.cpython-37.pyc
nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm5.cpython-37.pyc
nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm_model.cpython-37.pyc
nlp_resource_data/nltk/test/unit/translate/__pycache__/test_nist.cpython-37.pyc
nlp_resource_data/nltk/test/unit/translate/__pycache__/test_stack_decoder.cpython-37.pyc
nlp_resource_data/nltk/test/unit/translate/test_stack_decoder.py
nlp_resource_data/nltk/test/unit/utils.py
nlp_resource_data/nltk/test/util.doctest
nlp_resource_data/nltk/test/wordnet.doctest
nlp_resource_data/nltk/test/wordnet_fixt.py
nlp_resource_data/nltk/test/wordnet_lch.doctest
nlp_resource_data/nltk/test/wsd.doctest
nlp_resource_data/nltk/text.py
nlp_resource_data/nltk/tgrep.py
nlp_resource_data/nltk/tokenize/__init__.py
nlp_resource_data/nltk/tokenize/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/api.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/casual.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/destructive.cpython-37.pyc [deleted file]
nlp_resource_data/nltk/tokenize/__pycache__/mwe.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/nist.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/punkt.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/regexp.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/repp.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/sexpr.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/simple.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/sonority_sequencing.cpython-37.pyc [deleted file]
nlp_resource_data/nltk/tokenize/__pycache__/stanford.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/stanford_segmenter.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/texttiling.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/toktok.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/treebank.cpython-37.pyc
nlp_resource_data/nltk/tokenize/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/tokenize/api.py
nlp_resource_data/nltk/tokenize/casual.py
nlp_resource_data/nltk/tokenize/destructive.py [deleted file]
nlp_resource_data/nltk/tokenize/mwe.py
nlp_resource_data/nltk/tokenize/nist.py
nlp_resource_data/nltk/tokenize/punkt.py
nlp_resource_data/nltk/tokenize/regexp.py
nlp_resource_data/nltk/tokenize/repp.py
nlp_resource_data/nltk/tokenize/sexpr.py
nlp_resource_data/nltk/tokenize/simple.py
nlp_resource_data/nltk/tokenize/sonority_sequencing.py [deleted file]
nlp_resource_data/nltk/tokenize/stanford.py
nlp_resource_data/nltk/tokenize/stanford_segmenter.py
nlp_resource_data/nltk/tokenize/texttiling.py
nlp_resource_data/nltk/tokenize/toktok.py
nlp_resource_data/nltk/tokenize/treebank.py
nlp_resource_data/nltk/tokenize/util.py
nlp_resource_data/nltk/toolbox.py
nlp_resource_data/nltk/translate/__init__.py
nlp_resource_data/nltk/translate/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/api.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/bleu_score.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/chrf_score.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/gale_church.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/gdfa.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/gleu_score.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/ibm1.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/ibm2.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/ibm3.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/ibm4.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/ibm5.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/ibm_model.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/meteor_score.cpython-37.pyc [deleted file]
nlp_resource_data/nltk/translate/__pycache__/metrics.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/nist_score.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/phrase_based.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/ribes_score.cpython-37.pyc
nlp_resource_data/nltk/translate/__pycache__/stack_decoder.cpython-37.pyc
nlp_resource_data/nltk/translate/api.py
nlp_resource_data/nltk/translate/bleu_score.py
nlp_resource_data/nltk/translate/chrf_score.py
nlp_resource_data/nltk/translate/gale_church.py
nlp_resource_data/nltk/translate/gdfa.py
nlp_resource_data/nltk/translate/gleu_score.py
nlp_resource_data/nltk/translate/ibm1.py
nlp_resource_data/nltk/translate/ibm2.py
nlp_resource_data/nltk/translate/ibm3.py
nlp_resource_data/nltk/translate/ibm4.py
nlp_resource_data/nltk/translate/ibm5.py
nlp_resource_data/nltk/translate/ibm_model.py
nlp_resource_data/nltk/translate/meteor_score.py [deleted file]
nlp_resource_data/nltk/translate/metrics.py
nlp_resource_data/nltk/translate/nist_score.py
nlp_resource_data/nltk/translate/phrase_based.py
nlp_resource_data/nltk/translate/ribes_score.py
nlp_resource_data/nltk/translate/stack_decoder.py
nlp_resource_data/nltk/tree.py
nlp_resource_data/nltk/treeprettyprinter.py
nlp_resource_data/nltk/treetransforms.py
nlp_resource_data/nltk/twitter/__init__.py
nlp_resource_data/nltk/twitter/__pycache__/__init__.cpython-37.pyc
nlp_resource_data/nltk/twitter/__pycache__/api.cpython-37.pyc
nlp_resource_data/nltk/twitter/__pycache__/common.cpython-37.pyc
nlp_resource_data/nltk/twitter/__pycache__/twitter_demo.cpython-37.pyc
nlp_resource_data/nltk/twitter/__pycache__/twitterclient.cpython-37.pyc
nlp_resource_data/nltk/twitter/__pycache__/util.cpython-37.pyc
nlp_resource_data/nltk/twitter/api.py
nlp_resource_data/nltk/twitter/common.py
nlp_resource_data/nltk/twitter/twitter_demo.py
nlp_resource_data/nltk/twitter/twitterclient.py
nlp_resource_data/nltk/twitter/util.py
nlp_resource_data/nltk/util.py
nlp_resource_data/nltk/wsd.py

diff --git a/nlp_resource_data/nltk/__init__.py b/nlp_resource_data/nltk/__init__.py
index 32833cb..cd14254 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit (NLTK)
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors: Steven Bird <stevenbird1@gmail.com>
 #          Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -15,6 +15,7 @@ Steven Bird, Ewan Klein, and Edward Loper (2009).
 Natural Language Processing with Python.  O'Reilly Media Inc.
 http://nltk.org/book
 """
+from __future__ import print_function, absolute_import
 
 import os
 
@@ -26,21 +27,21 @@ import os
 # in the file VERSION.
 try:
     # If a VERSION file exists, use it!
-    version_file = os.path.join(os.path.dirname(__file__), "VERSION")
-    with open(version_file, "r") as infile:
+    version_file = os.path.join(os.path.dirname(__file__), 'VERSION')
+    with open(version_file, 'r') as infile:
         __version__ = infile.read().strip()
 except NameError:
-    __version__ = "unknown (running code interactively?)"
+    __version__ = 'unknown (running code interactively?)'
 except IOError as ex:
     __version__ = "unknown (%s)" % ex
 
 if __doc__ is not None:  # fix for the ``python -OO``
-    __doc__ += "\n@version: " + __version__
+    __doc__ += '\n@version: ' + __version__
 
 
 # Copyright notice
 __copyright__ = """\
-Copyright (C) 2001-2020 NLTK Project.
+Copyright (C) 2001-2019 NLTK Project.
 
 Distributed and Licensed under the Apache License, Version 2.0,
 which is included by reference.
@@ -52,18 +53,18 @@ __longdescr__ = """\
 The Natural Language Toolkit (NLTK) is a Python package for
 natural language processing.  NLTK requires Python 2.6 or higher."""
 __keywords__ = [
-    "NLP",
-    "CL",
-    "natural language processing",
-    "computational linguistics",
-    "parsing",
-    "tagging",
-    "tokenizing",
-    "syntax",
-    "linguistics",
-    "language",
-    "natural language",
-    "text analytics",
+    'NLP',
+    'CL',
+    'natural language processing',
+    'computational linguistics',
+    'parsing',
+    'tagging',
+    'tokenizing',
+    'syntax',
+    'linguistics',
+    'language',
+    'natural language',
+    'text analytics',
 ]
 __url__ = "http://nltk.org/"
 
@@ -75,24 +76,24 @@ __author_email__ = __maintainer_email__
 
 # "Trove" classifiers for Python Package Index.
 __classifiers__ = [
-    "Development Status :: 5 - Production/Stable",
-    "Intended Audience :: Developers",
-    "Intended Audience :: Education",
-    "Intended Audience :: Information Technology",
-    "Intended Audience :: Science/Research",
-    "License :: OSI Approved :: Apache Software License",
-    "Operating System :: OS Independent",
-    "Programming Language :: Python :: 2.6",
-    "Programming Language :: Python :: 2.7",
-    "Topic :: Scientific/Engineering",
-    "Topic :: Scientific/Engineering :: Artificial Intelligence",
-    "Topic :: Scientific/Engineering :: Human Machine Interfaces",
-    "Topic :: Scientific/Engineering :: Information Analysis",
-    "Topic :: Text Processing",
-    "Topic :: Text Processing :: Filters",
-    "Topic :: Text Processing :: General",
-    "Topic :: Text Processing :: Indexing",
-    "Topic :: Text Processing :: Linguistic",
+    'Development Status :: 5 - Production/Stable',
+    'Intended Audience :: Developers',
+    'Intended Audience :: Education',
+    'Intended Audience :: Information Technology',
+    'Intended Audience :: Science/Research',
+    'License :: OSI Approved :: Apache Software License',
+    'Operating System :: OS Independent',
+    'Programming Language :: Python :: 2.6',
+    'Programming Language :: Python :: 2.7',
+    'Topic :: Scientific/Engineering',
+    'Topic :: Scientific/Engineering :: Artificial Intelligence',
+    'Topic :: Scientific/Engineering :: Human Machine Interfaces',
+    'Topic :: Scientific/Engineering :: Information Analysis',
+    'Topic :: Text Processing',
+    'Topic :: Text Processing :: Filters',
+    'Topic :: Text Processing :: General',
+    'Topic :: Text Processing :: Indexing',
+    'Topic :: Text Processing :: Linguistic',
 ]
 
 from nltk.internals import config_java
@@ -106,16 +107,16 @@ except ImportError:
 # Override missing methods on environments where it cannot be used like GAE.
 import subprocess
 
-if not hasattr(subprocess, "PIPE"):
+if not hasattr(subprocess, 'PIPE'):
 
     def _fake_PIPE(*args, **kwargs):
-        raise NotImplementedError("subprocess.PIPE is not supported.")
+        raise NotImplementedError('subprocess.PIPE is not supported.')
 
     subprocess.PIPE = _fake_PIPE
-if not hasattr(subprocess, "Popen"):
+if not hasattr(subprocess, 'Popen'):
 
     def _fake_Popen(*args, **kwargs):
-        raise NotImplementedError("subprocess.Popen is not supported.")
+        raise NotImplementedError('subprocess.Popen is not supported.')
 
     subprocess.Popen = _fake_Popen
 
@@ -157,11 +158,11 @@ from nltk.stem import *
 
 from nltk import lazyimport
 
-app = lazyimport.LazyModule("nltk.app", locals(), globals())
-chat = lazyimport.LazyModule("nltk.chat", locals(), globals())
-corpus = lazyimport.LazyModule("nltk.corpus", locals(), globals())
-draw = lazyimport.LazyModule("nltk.draw", locals(), globals())
-toolbox = lazyimport.LazyModule("nltk.toolbox", locals(), globals())
+app = lazyimport.LazyModule('nltk.app', locals(), globals())
+chat = lazyimport.LazyModule('nltk.chat', locals(), globals())
+corpus = lazyimport.LazyModule('nltk.corpus', locals(), globals())
+draw = lazyimport.LazyModule('nltk.draw', locals(), globals())
+toolbox = lazyimport.LazyModule('nltk.toolbox', locals(), globals())
 
 # Optional loading
 
@@ -175,7 +176,7 @@ else:
 from nltk.downloader import download, download_shell
 
 try:
-    import tkinter
+    from six.moves import tkinter
 except ImportError:
     pass
 else:
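
For reference, the recurring edit in this file (and throughout the commit) replaces direct Python 3 imports of tkinter with the six.moves compatibility aliases, which is how the NLTK 3.4 code base keeps a single source tree running on both Python 2 and Python 3. Below is a minimal sketch of the guarded-import pattern the hunk above restores; it assumes six is installed, and the has_gui() helper is illustrative, not part of NLTK:

try:
    from six.moves import tkinter  # resolves to Tkinter on Python 2, tkinter on Python 3
except ImportError:
    tkinter = None  # GUI-dependent features are simply skipped when no Tk toolkit exists

def has_gui():
    # True when a Tk toolkit could be imported in the current environment.
    return tkinter is not None
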
index 625f133..a6e7071 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/__init__.cpython-37.pyc differ
index 7eb915b..1fa7da6 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/book.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/book.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/__pycache__/cli.cpython-37.pyc b/nlp_resource_data/nltk/__pycache__/cli.cpython-37.pyc
deleted file mode 100644 (file)
index 0848537..0000000
Binary files a/nlp_resource_data/nltk/__pycache__/cli.cpython-37.pyc and /dev/null differ
index 54cc532..d7f7d19 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/collections.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/collections.cpython-37.pyc differ
index 85a13f8..07790ee 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/collocations.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/collocations.cpython-37.pyc differ
index a8163a3..81821ed 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/compat.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/compat.cpython-37.pyc differ
index a35d1c7..55a3378 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/data.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/data.cpython-37.pyc differ
index 0592dc3..35fd2fc 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/decorators.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/decorators.cpython-37.pyc differ
index 5646828..ca73501 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/downloader.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/downloader.cpython-37.pyc differ
index 9f2b636..5dbfbd8 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/featstruct.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/featstruct.cpython-37.pyc differ
index 22d6037..60ce228 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/grammar.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/grammar.cpython-37.pyc differ
index 7f307fd..755ca73 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/help.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/help.cpython-37.pyc differ
index af20c90..3992005 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/internals.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/internals.cpython-37.pyc differ
index 7cecde8..87cc6bd 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/jsontags.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/jsontags.cpython-37.pyc differ
index 8d4bfbd..20c322e 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/lazyimport.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/lazyimport.cpython-37.pyc differ
index 6897e4f..f113338 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/probability.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/probability.cpython-37.pyc differ
index 8740f67..8e5f489 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/text.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/text.cpython-37.pyc differ
index 84f29c7..4d6ff92 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/tgrep.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/tgrep.cpython-37.pyc differ
index d7552ab..e95e636 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/toolbox.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/toolbox.cpython-37.pyc differ
index c737e0a..3fa1eb0 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/tree.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/tree.cpython-37.pyc differ
index e4a9364..8fe009d 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/treeprettyprinter.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/treeprettyprinter.cpython-37.pyc differ
index 9e4dd09..baa8cdd 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/treetransforms.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/treetransforms.cpython-37.pyc differ
index 3bd6059..b175bc9 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/util.cpython-37.pyc differ
index 8b85d4d..2772e4b 100644 (file)
Binary files a/nlp_resource_data/nltk/__pycache__/wsd.cpython-37.pyc and b/nlp_resource_data/nltk/__pycache__/wsd.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/app/__init__.py b/nlp_resource_data/nltk/app/__init__.py
index 458ac4c..19157ac 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Applications package
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
@@ -22,7 +22,7 @@ wordnet:      WordNet Browser
 
 # Import Tkinter-based modules if Tkinter is installed
 try:
-    import tkinter
+    from six.moves import tkinter
 except ImportError:
     import warnings
 
index cc90f65..54f030e 100644 (file)
Binary files a/nlp_resource_data/nltk/app/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/app/__pycache__/__init__.cpython-37.pyc differ
index 10a3a53..887e567 100644 (file)
Binary files a/nlp_resource_data/nltk/app/__pycache__/chartparser_app.cpython-37.pyc and b/nlp_resource_data/nltk/app/__pycache__/chartparser_app.cpython-37.pyc differ
index c0a1d94..699110a 100644 (file)
Binary files a/nlp_resource_data/nltk/app/__pycache__/chunkparser_app.cpython-37.pyc and b/nlp_resource_data/nltk/app/__pycache__/chunkparser_app.cpython-37.pyc differ
index ad048ca..f63c308 100644 (file)
Binary files a/nlp_resource_data/nltk/app/__pycache__/collocations_app.cpython-37.pyc and b/nlp_resource_data/nltk/app/__pycache__/collocations_app.cpython-37.pyc differ
index bfb4ca5..c153a83 100644 (file)
Binary files a/nlp_resource_data/nltk/app/__pycache__/concordance_app.cpython-37.pyc and b/nlp_resource_data/nltk/app/__pycache__/concordance_app.cpython-37.pyc differ
index 2fc5157..51b22e2 100644 (file)
Binary files a/nlp_resource_data/nltk/app/__pycache__/nemo_app.cpython-37.pyc and b/nlp_resource_data/nltk/app/__pycache__/nemo_app.cpython-37.pyc differ
index 7051cd5..c246594 100644 (file)
Binary files a/nlp_resource_data/nltk/app/__pycache__/rdparser_app.cpython-37.pyc and b/nlp_resource_data/nltk/app/__pycache__/rdparser_app.cpython-37.pyc differ
index f3c02a8..bdc6853 100644 (file)
Binary files a/nlp_resource_data/nltk/app/__pycache__/srparser_app.cpython-37.pyc and b/nlp_resource_data/nltk/app/__pycache__/srparser_app.cpython-37.pyc differ
index 0532e32..131b18b 100644 (file)
Binary files a/nlp_resource_data/nltk/app/__pycache__/wordfreq_app.cpython-37.pyc and b/nlp_resource_data/nltk/app/__pycache__/wordfreq_app.cpython-37.pyc differ
index da15b5f..f6f752c 100644 (file)
Binary files a/nlp_resource_data/nltk/app/__pycache__/wordnet_app.cpython-37.pyc and b/nlp_resource_data/nltk/app/__pycache__/wordnet_app.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/app/chartparser_app.py b/nlp_resource_data/nltk/app/chartparser_app.py
index 92fff32..d42095e 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Chart Parser Application
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Jean Mark Gawron <gawron@mail.sdsu.edu>
 #         Steven Bird <stevenbird1@gmail.com>
@@ -37,10 +37,11 @@ edge you wish to apply a rule to.
 # widget system.
 
 
+from __future__ import division
 import pickle
 import os.path
 
-from tkinter import (
+from six.moves.tkinter import (
     Button,
     Canvas,
     Checkbutton,
@@ -52,9 +53,9 @@ from tkinter import (
     Tk,
     Toplevel,
 )
-from tkinter.font import Font
-from tkinter.messagebox import showerror, showinfo
-from tkinter.filedialog import asksaveasfilename, askopenfilename
+from six.moves.tkinter_font import Font
+from six.moves.tkinter_messagebox import showerror, showinfo
+from six.moves.tkinter_tkfiledialog import asksaveasfilename, askopenfilename
 
 from nltk.parse.chart import (
     BottomUpPredictCombineRule,
@@ -90,29 +91,29 @@ from nltk.draw import CFGEditor, tree_to_treesegment, TreeSegmentWidget
 
 
 class EdgeList(ColorizedList):
-    ARROW = SymbolWidget.SYMBOLS["rightarrow"]
+    ARROW = SymbolWidget.SYMBOLS['rightarrow']
 
     def _init_colortags(self, textwidget, options):
-        textwidget.tag_config("terminal", foreground="#006000")
-        textwidget.tag_config("arrow", font="symbol", underline="0")
-        textwidget.tag_config("dot", foreground="#000000")
+        textwidget.tag_config('terminal', foreground='#006000')
+        textwidget.tag_config('arrow', font='symbol', underline='0')
+        textwidget.tag_config('dot', foreground='#000000')
         textwidget.tag_config(
-            "nonterminal", foreground="blue", font=("helvetica", -12, "bold")
+            'nonterminal', foreground='blue', font=('helvetica', -12, 'bold')
         )
 
     def _item_repr(self, item):
         contents = []
-        contents.append(("%s\t" % item.lhs(), "nonterminal"))
-        contents.append((self.ARROW, "arrow"))
+        contents.append(('%s\t' % item.lhs(), 'nonterminal'))
+        contents.append((self.ARROW, 'arrow'))
         for i, elt in enumerate(item.rhs()):
             if i == item.dot():
-                contents.append((" *", "dot"))
+                contents.append((' *', 'dot'))
             if isinstance(elt, Nonterminal):
-                contents.append((" %s" % elt.symbol(), "nonterminal"))
+                contents.append((' %s' % elt.symbol(), 'nonterminal'))
             else:
-                contents.append((" %r" % elt, "terminal"))
+                contents.append((' %r' % elt, 'terminal'))
         if item.is_complete():
-            contents.append((" *", "dot"))
+            contents.append((' *', 'dot'))
         return contents
 
 
@@ -127,7 +128,7 @@ class ChartMatrixView(object):
     """
 
     def __init__(
-        self, parent, chart, toplevel=True, title="Chart Matrix", show_numedges=False
+        self, parent, chart, toplevel=True, title='Chart Matrix', show_numedges=False
     ):
         self._chart = chart
         self._cells = []
@@ -138,7 +139,7 @@ class ChartMatrixView(object):
         if toplevel:
             self._root = Toplevel(parent)
             self._root.title(title)
-            self._root.bind("<Control-q>", self.destroy)
+            self._root.bind('<Control-q>', self.destroy)
             self._init_quit(self._root)
         else:
             self._root = Frame(parent)
@@ -157,27 +158,27 @@ class ChartMatrixView(object):
         self.draw()
 
     def _init_quit(self, root):
-        quit = Button(root, text="Quit", command=self.destroy)
-        quit.pack(side="bottom", expand=0, fill="none")
+        quit = Button(root, text='Quit', command=self.destroy)
+        quit.pack(side='bottom', expand=0, fill='none')
 
     def _init_matrix(self, root):
-        cframe = Frame(root, border=2, relief="sunken")
-        cframe.pack(expand=0, fill="none", padx=1, pady=3, side="top")
-        self._canvas = Canvas(cframe, width=200, height=200, background="white")
-        self._canvas.pack(expand=0, fill="none")
+        cframe = Frame(root, border=2, relief='sunken')
+        cframe.pack(expand=0, fill='none', padx=1, pady=3, side='top')
+        self._canvas = Canvas(cframe, width=200, height=200, background='white')
+        self._canvas.pack(expand=0, fill='none')
 
     def _init_numedges(self, root):
-        self._numedges_label = Label(root, text="0 edges")
-        self._numedges_label.pack(expand=0, fill="none", side="top")
+        self._numedges_label = Label(root, text='0 edges')
+        self._numedges_label.pack(expand=0, fill='none', side='top')
 
     def _init_list(self, root):
         self._list = EdgeList(root, [], width=20, height=5)
-        self._list.pack(side="top", expand=1, fill="both", pady=3)
+        self._list.pack(side='top', expand=1, fill='both', pady=3)
 
         def cb(edge, self=self):
-            self._fire_callbacks("select", edge)
+            self._fire_callbacks('select', edge)
 
-        self._list.add_callback("select", cb)
+        self._list.add_callback('select', cb)
         self._list.focus()
 
     def destroy(self, *e):
@@ -209,19 +210,19 @@ class ChartMatrixView(object):
         for i in range(N):
             for j in range(i, N):
                 if cell_edges[i][j] == 0:
-                    color = "gray20"
+                    color = 'gray20'
                 else:
-                    color = "#00%02x%02x" % (
+                    color = '#00%02x%02x' % (
                         min(255, 50 + 128 * cell_edges[i][j] / 10),
                         max(0, 128 - 128 * cell_edges[i][j] / 10),
                     )
                 cell_tag = self._cells[i][j]
                 self._canvas.itemconfig(cell_tag, fill=color)
                 if (i, j) == self._selected_cell:
-                    self._canvas.itemconfig(cell_tag, outline="#00ffff", width=3)
+                    self._canvas.itemconfig(cell_tag, outline='#00ffff', width=3)
                     self._canvas.tag_raise(cell_tag)
                 else:
-                    self._canvas.itemconfig(cell_tag, outline="black", width=1)
+                    self._canvas.itemconfig(cell_tag, outline='black', width=1)
 
         # Update the edge list.
         edges = list(self._chart.select(span=self._selected_cell))
@@ -230,14 +231,14 @@ class ChartMatrixView(object):
         # Update our edge count.
         self._num_edges = self._chart.num_edges()
         if self._numedges_label is not None:
-            self._numedges_label["text"] = "%d edges" % self._num_edges
+            self._numedges_label['text'] = '%d edges' % self._num_edges
 
     def activate(self):
-        self._canvas.itemconfig("inactivebox", state="hidden")
+        self._canvas.itemconfig('inactivebox', state='hidden')
         self.update()
 
     def inactivate(self):
-        self._canvas.itemconfig("inactivebox", state="normal")
+        self._canvas.itemconfig('inactivebox', state='normal')
         self.update()
 
     def add_callback(self, event, func):
@@ -271,7 +272,7 @@ class ChartMatrixView(object):
         self.update()
 
         # Fire the callback.
-        self._fire_callbacks("select_cell", i, j)
+        self._fire_callbacks('select_cell', i, j)
 
     def deselect_cell(self):
         if self._root is None:
@@ -313,37 +314,37 @@ class ChartMatrixView(object):
         LEFT_MARGIN = BOT_MARGIN = 15
         TOP_MARGIN = 5
         c = self._canvas
-        c.delete("all")
+        c.delete('all')
         N = self._chart.num_leaves() + 1
-        dx = (int(c["width"]) - LEFT_MARGIN) / N
-        dy = (int(c["height"]) - TOP_MARGIN - BOT_MARGIN) / N
+        dx = (int(c['width']) - LEFT_MARGIN) / N
+        dy = (int(c['height']) - TOP_MARGIN - BOT_MARGIN) / N
 
-        c.delete("all")
+        c.delete('all')
 
         # Labels and dotted lines
         for i in range(N):
             c.create_text(
-                LEFT_MARGIN - 2, i * dy + dy / 2 + TOP_MARGIN, text=repr(i), anchor="e"
+                LEFT_MARGIN - 2, i * dy + dy / 2 + TOP_MARGIN, text=repr(i), anchor='e'
             )
             c.create_text(
                 i * dx + dx / 2 + LEFT_MARGIN,
                 N * dy + TOP_MARGIN + 1,
                 text=repr(i),
-                anchor="n",
+                anchor='n',
             )
             c.create_line(
                 LEFT_MARGIN,
                 dy * (i + 1) + TOP_MARGIN,
                 dx * N + LEFT_MARGIN,
                 dy * (i + 1) + TOP_MARGIN,
-                dash=".",
+                dash='.',
             )
             c.create_line(
                 dx * i + LEFT_MARGIN,
                 TOP_MARGIN,
                 dx * i + LEFT_MARGIN,
                 dy * N + TOP_MARGIN,
-                dash=".",
+                dash='.',
             )
 
         # A box around the whole thing
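The from __future__ import division added in the first hunk is what keeps the cell-geometry math in this hunk working on Python 2: expressions such as dx = (int(c['width']) - LEFT_MARGIN) / N are expected to produce floats, and plain / between two ints truncates on Python 2 unless true division is enabled. A small sketch of the difference, using hypothetical values rather than anything from the app:

    from __future__ import division   # must sit at the top of the module on Py2

    width, margin, n = 200, 15, 7     # hypothetical canvas width, margin, leaf count
    dx = (width - margin) / n         # 26.428... on both Py2 and Py3 with the future import
    # Without it, Python 2 would compute 185 / 7 == 26 and silently shift every cell.
    dx_floor = (width - margin) // n  # floor division stays explicit when truncation is intended
    print(dx, dx_floor)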
@@ -360,25 +361,25 @@ class ChartMatrixView(object):
                     i * dy + TOP_MARGIN,
                     (j + 1) * dx + LEFT_MARGIN,
                     (i + 1) * dy + TOP_MARGIN,
-                    fill="gray20",
+                    fill='gray20',
                 )
                 self._cells[i][j] = t
 
                 def cb(event, self=self, i=i, j=j):
                     self._click_cell(i, j)
 
-                c.tag_bind(t, "<Button-1>", cb)
+                c.tag_bind(t, '<Button-1>', cb)
 
         # Inactive box
-        xmax, ymax = int(c["width"]), int(c["height"])
+        xmax, ymax = int(c['width']), int(c['height'])
         t = c.create_rectangle(
             -100,
             -100,
             xmax + 100,
             ymax + 100,
-            fill="gray50",
-            state="hidden",
-            tag="inactivebox",
+            fill='gray50',
+            state='hidden',
+            tag='inactivebox',
         )
         c.tag_lower(t)
 
@@ -406,24 +407,24 @@ class ChartResultsView(object):
 
         if toplevel:
             self._root = Toplevel(parent)
-            self._root.title("Chart Parser Application: Results")
-            self._root.bind("<Control-q>", self.destroy)
+            self._root.title('Chart Parser Application: Results')
+            self._root.bind('<Control-q>', self.destroy)
         else:
             self._root = Frame(parent)
 
         # Buttons
         if toplevel:
             buttons = Frame(self._root)
-            buttons.pack(side="bottom", expand=0, fill="x")
-            Button(buttons, text="Quit", command=self.destroy).pack(side="right")
-            Button(buttons, text="Print All", command=self.print_all).pack(side="left")
-            Button(buttons, text="Print Selection", command=self.print_selection).pack(
-                side="left"
+            buttons.pack(side='bottom', expand=0, fill='x')
+            Button(buttons, text='Quit', command=self.destroy).pack(side='right')
+            Button(buttons, text='Print All', command=self.print_all).pack(side='left')
+            Button(buttons, text='Print Selection', command=self.print_selection).pack(
+                side='left'
             )
 
         # Canvas frame.
         self._cframe = CanvasFrame(self._root, closeenough=20)
-        self._cframe.pack(side="top", expand=1, fill="both")
+        self._cframe.pack(side='top', expand=1, fill='both')
 
         # Initial update
         self.update()
@@ -466,15 +467,15 @@ class ChartResultsView(object):
             c.delete(self._selectbox)
         self._selection = widget
         (x1, y1, x2, y2) = widget.bbox()
-        self._selectbox = c.create_rectangle(x1, y1, x2, y2, width=2, outline="#088")
+        self._selectbox = c.create_rectangle(x1, y1, x2, y2, width=2, outline='#088')
 
     def _color(self, treewidget, color):
-        treewidget.label()["color"] = color
+        treewidget.label()['color'] = color
         for child in treewidget.subtrees():
             if isinstance(child, TreeSegmentWidget):
                 self._color(child, color)
             else:
-                child["color"] = color
+                child['color'] = color
 
     def print_all(self, *e):
         if self._root is None:
@@ -485,7 +486,7 @@ class ChartResultsView(object):
         if self._root is None:
             return
         if self._selection is None:
-            showerror("Print Error", "No tree selected")
+            showerror('Print Error', 'No tree selected')
         else:
             c = self._cframe.canvas()
             for widget in self._treewidgets:
@@ -494,7 +495,7 @@ class ChartResultsView(object):
             c.delete(self._selectbox)
             (x1, y1, x2, y2) = self._selection.bbox()
             self._selection.move(10 - x1, 10 - y1)
-            c["scrollregion"] = "0 0 %s %s" % (x2 - x1 + 20, y2 - y1 + 20)
+            c['scrollregion'] = '0 0 %s %s' % (x2 - x1 + 20, y2 - y1 + 20)
             self._cframe.print_to_file()
 
             # Restore our state.
@@ -571,25 +572,25 @@ class ChartComparer(object):
     """
 
     _OPSYMBOL = {
-        "-": "-",
-        "and": SymbolWidget.SYMBOLS["intersection"],
-        "or": SymbolWidget.SYMBOLS["union"],
+        '-': '-',
+        'and': SymbolWidget.SYMBOLS['intersection'],
+        'or': SymbolWidget.SYMBOLS['union'],
     }
 
     def __init__(self, *chart_filenames):
         # This chart is displayed when we don't have a value (eg
         # before any chart is loaded).
-        faketok = [""] * 8
+        faketok = [''] * 8
         self._emptychart = Chart(faketok)
 
         # The left & right charts start out empty.
-        self._left_name = "None"
-        self._right_name = "None"
+        self._left_name = 'None'
+        self._right_name = 'None'
         self._left_chart = self._emptychart
         self._right_chart = self._emptychart
 
         # The charts that have been loaded.
-        self._charts = {"None": self._emptychart}
+        self._charts = {'None': self._emptychart}
 
         # The output chart.
         self._out_chart = self._emptychart
@@ -599,9 +600,9 @@ class ChartComparer(object):
 
         # Set up the root window.
         self._root = Tk()
-        self._root.title("Chart Comparison")
-        self._root.bind("<Control-q>", self.destroy)
-        self._root.bind("<Control-x>", self.destroy)
+        self._root.title('Chart Comparison')
+        self._root.bind('<Control-q>', self.destroy)
+        self._root.bind('<Control-x>', self.destroy)
 
         # Initialize all widgets, etc.
         self._init_menubar(self._root)
@@ -637,122 +638,122 @@ class ChartComparer(object):
         # File menu
         filemenu = Menu(menubar, tearoff=0)
         filemenu.add_command(
-            label="Load Chart",
-            accelerator="Ctrl-o",
+            label='Load Chart',
+            accelerator='Ctrl-o',
             underline=0,
             command=self.load_chart_dialog,
         )
         filemenu.add_command(
-            label="Save Output",
-            accelerator="Ctrl-s",
+            label='Save Output',
+            accelerator='Ctrl-s',
             underline=0,
             command=self.save_chart_dialog,
         )
         filemenu.add_separator()
         filemenu.add_command(
-            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+            label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
         )
-        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
 
         # Compare menu
         opmenu = Menu(menubar, tearoff=0)
         opmenu.add_command(
-            label="Intersection", command=self._intersection, accelerator="+"
+            label='Intersection', command=self._intersection, accelerator='+'
         )
-        opmenu.add_command(label="Union", command=self._union, accelerator="*")
+        opmenu.add_command(label='Union', command=self._union, accelerator='*')
         opmenu.add_command(
-            label="Difference", command=self._difference, accelerator="-"
+            label='Difference', command=self._difference, accelerator='-'
         )
         opmenu.add_separator()
-        opmenu.add_command(label="Swap Charts", command=self._swapcharts)
-        menubar.add_cascade(label="Compare", underline=0, menu=opmenu)
+        opmenu.add_command(label='Swap Charts', command=self._swapcharts)
+        menubar.add_cascade(label='Compare', underline=0, menu=opmenu)
 
         # Add the menu
         self._root.config(menu=menubar)
 
     def _init_divider(self, root):
-        divider = Frame(root, border=2, relief="sunken")
-        divider.pack(side="top", fill="x", ipady=2)
+        divider = Frame(root, border=2, relief='sunken')
+        divider.pack(side='top', fill='x', ipady=2)
 
     def _init_chartviews(self, root):
-        opfont = ("symbol", -36)  # Font for operator.
-        eqfont = ("helvetica", -36)  # Font for equals sign.
+        opfont = ('symbol', -36)  # Font for operator.
+        eqfont = ('helvetica', -36)  # Font for equals sign.
 
-        frame = Frame(root, background="#c0c0c0")
-        frame.pack(side="top", expand=1, fill="both")
+        frame = Frame(root, background='#c0c0c0')
+        frame.pack(side='top', expand=1, fill='both')
 
         # The left matrix.
-        cv1_frame = Frame(frame, border=3, relief="groove")
-        cv1_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both")
+        cv1_frame = Frame(frame, border=3, relief='groove')
+        cv1_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
         self._left_selector = MutableOptionMenu(
             cv1_frame, list(self._charts.keys()), command=self._select_left
         )
-        self._left_selector.pack(side="top", pady=5, fill="x")
+        self._left_selector.pack(side='top', pady=5, fill='x')
         self._left_matrix = ChartMatrixView(
             cv1_frame, self._emptychart, toplevel=False, show_numedges=True
         )
-        self._left_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both")
-        self._left_matrix.add_callback("select", self.select_edge)
-        self._left_matrix.add_callback("select_cell", self.select_cell)
+        self._left_matrix.pack(side='bottom', padx=5, pady=5, expand=1, fill='both')
+        self._left_matrix.add_callback('select', self.select_edge)
+        self._left_matrix.add_callback('select_cell', self.select_cell)
         self._left_matrix.inactivate()
 
         # The operator.
         self._op_label = Label(
-            frame, text=" ", width=3, background="#c0c0c0", font=opfont
+            frame, text=' ', width=3, background='#c0c0c0', font=opfont
         )
-        self._op_label.pack(side="left", padx=5, pady=5)
+        self._op_label.pack(side='left', padx=5, pady=5)
 
         # The right matrix.
-        cv2_frame = Frame(frame, border=3, relief="groove")
-        cv2_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both")
+        cv2_frame = Frame(frame, border=3, relief='groove')
+        cv2_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
         self._right_selector = MutableOptionMenu(
             cv2_frame, list(self._charts.keys()), command=self._select_right
         )
-        self._right_selector.pack(side="top", pady=5, fill="x")
+        self._right_selector.pack(side='top', pady=5, fill='x')
         self._right_matrix = ChartMatrixView(
             cv2_frame, self._emptychart, toplevel=False, show_numedges=True
         )
-        self._right_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both")
-        self._right_matrix.add_callback("select", self.select_edge)
-        self._right_matrix.add_callback("select_cell", self.select_cell)
+        self._right_matrix.pack(side='bottom', padx=5, pady=5, expand=1, fill='both')
+        self._right_matrix.add_callback('select', self.select_edge)
+        self._right_matrix.add_callback('select_cell', self.select_cell)
         self._right_matrix.inactivate()
 
         # The equals sign
-        Label(frame, text="=", width=3, background="#c0c0c0", font=eqfont).pack(
-            side="left", padx=5, pady=5
+        Label(frame, text='=', width=3, background='#c0c0c0', font=eqfont).pack(
+            side='left', padx=5, pady=5
         )
 
         # The output matrix.
-        out_frame = Frame(frame, border=3, relief="groove")
-        out_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both")
-        self._out_label = Label(out_frame, text="Output")
-        self._out_label.pack(side="top", pady=9)
+        out_frame = Frame(frame, border=3, relief='groove')
+        out_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
+        self._out_label = Label(out_frame, text='Output')
+        self._out_label.pack(side='top', pady=9)
         self._out_matrix = ChartMatrixView(
             out_frame, self._emptychart, toplevel=False, show_numedges=True
         )
-        self._out_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both")
-        self._out_matrix.add_callback("select", self.select_edge)
-        self._out_matrix.add_callback("select_cell", self.select_cell)
+        self._out_matrix.pack(side='bottom', padx=5, pady=5, expand=1, fill='both')
+        self._out_matrix.add_callback('select', self.select_edge)
+        self._out_matrix.add_callback('select_cell', self.select_cell)
         self._out_matrix.inactivate()
 
     def _init_buttons(self, root):
         buttons = Frame(root)
-        buttons.pack(side="bottom", pady=5, fill="x", expand=0)
-        Button(buttons, text="Intersection", command=self._intersection).pack(
-            side="left"
+        buttons.pack(side='bottom', pady=5, fill='x', expand=0)
+        Button(buttons, text='Intersection', command=self._intersection).pack(
+            side='left'
         )
-        Button(buttons, text="Union", command=self._union).pack(side="left")
-        Button(buttons, text="Difference", command=self._difference).pack(side="left")
-        Frame(buttons, width=20).pack(side="left")
-        Button(buttons, text="Swap Charts", command=self._swapcharts).pack(side="left")
+        Button(buttons, text='Union', command=self._union).pack(side='left')
+        Button(buttons, text='Difference', command=self._difference).pack(side='left')
+        Frame(buttons, width=20).pack(side='left')
+        Button(buttons, text='Swap Charts', command=self._swapcharts).pack(side='left')
 
-        Button(buttons, text="Detatch Output", command=self._detatch_out).pack(
-            side="right"
+        Button(buttons, text='Detatch Output', command=self._detatch_out).pack(
+            side='right'
         )
 
     def _init_bindings(self, root):
         # root.bind('<Control-s>', self.save_chart)
-        root.bind("<Control-o>", self.load_chart_dialog)
+        root.bind('<Control-o>', self.load_chart_dialog)
         # root.bind('<Control-r>', self.reset)
 
     # ////////////////////////////////////////////////////////////
@@ -763,7 +764,7 @@ class ChartComparer(object):
         self._left_name = name
         self._left_chart = self._charts[name]
         self._left_matrix.set_chart(self._left_chart)
-        if name == "None":
+        if name == 'None':
             self._left_matrix.inactivate()
         self._apply_op()
 
@@ -771,40 +772,40 @@ class ChartComparer(object):
         self._right_name = name
         self._right_chart = self._charts[name]
         self._right_matrix.set_chart(self._right_chart)
-        if name == "None":
+        if name == 'None':
             self._right_matrix.inactivate()
         self._apply_op()
 
     def _apply_op(self):
-        if self._operator == "-":
+        if self._operator == '-':
             self._difference()
-        elif self._operator == "or":
+        elif self._operator == 'or':
             self._union()
-        elif self._operator == "and":
+        elif self._operator == 'and':
             self._intersection()
 
     # ////////////////////////////////////////////////////////////
     # File
     # ////////////////////////////////////////////////////////////
-    CHART_FILE_TYPES = [("Pickle file", ".pickle"), ("All files", "*")]
+    CHART_FILE_TYPES = [('Pickle file', '.pickle'), ('All files', '*')]
 
     def save_chart_dialog(self, *args):
         filename = asksaveasfilename(
-            filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
+            filetypes=self.CHART_FILE_TYPES, defaultextension='.pickle'
         )
         if not filename:
             return
         try:
-            with open(filename, "wb") as outfile:
+            with open(filename, 'wb') as outfile:
                 pickle.dump(self._out_chart, outfile)
         except Exception as e:
             showerror(
-                "Error Saving Chart", "Unable to open file: %r\n%s" % (filename, e)
+                'Error Saving Chart', 'Unable to open file: %r\n%s' % (filename, e)
             )
 
     def load_chart_dialog(self, *args):
         filename = askopenfilename(
-            filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
+            filetypes=self.CHART_FILE_TYPES, defaultextension='.pickle'
         )
         if not filename:
             return
@@ -812,16 +813,16 @@ class ChartComparer(object):
             self.load_chart(filename)
         except Exception as e:
             showerror(
-                "Error Loading Chart", "Unable to open file: %r\n%s" % (filename, e)
+                'Error Loading Chart', 'Unable to open file: %r\n%s' % (filename, e)
             )
 
     def load_chart(self, filename):
-        with open(filename, "rb") as infile:
+        with open(filename, 'rb') as infile:
             chart = pickle.load(infile)
         name = os.path.basename(filename)
-        if name.endswith(".pickle"):
+        if name.endswith('.pickle'):
             name = name[:-7]
-        if name.endswith(".chart"):
+        if name.endswith('.chart'):
             name = name[:-6]
         self._charts[name] = chart
         self._left_selector.add(name)
@@ -875,7 +876,7 @@ class ChartComparer(object):
             if edge not in self._right_chart:
                 out_chart.insert(edge, [])
 
-        self._update("-", out_chart)
+        self._update('-', out_chart)
 
     def _intersection(self):
         if not self._checkcompat():
@@ -886,7 +887,7 @@ class ChartComparer(object):
             if edge in self._right_chart:
                 out_chart.insert(edge, [])
 
-        self._update("and", out_chart)
+        self._update('and', out_chart)
 
     def _union(self):
         if not self._checkcompat():
@@ -898,7 +899,7 @@ class ChartComparer(object):
         for edge in self._right_chart:
             out_chart.insert(edge, [])
 
-        self._update("or", out_chart)
+        self._update('or', out_chart)
 
     def _swapcharts(self):
         left, right = self._left_name, self._right_name
@@ -916,7 +917,7 @@ class ChartComparer(object):
             self._out_chart = self._emptychart
             self._out_matrix.set_chart(self._out_chart)
             self._out_matrix.inactivate()
-            self._out_label["text"] = "Output"
+            self._out_label['text'] = 'Output'
             # Issue some other warning?
             return False
         else:
@@ -924,10 +925,10 @@ class ChartComparer(object):
 
     def _update(self, operator, out_chart):
         self._operator = operator
-        self._op_label["text"] = self._OPSYMBOL[operator]
+        self._op_label['text'] = self._OPSYMBOL[operator]
         self._out_chart = out_chart
         self._out_matrix.set_chart(out_chart)
-        self._out_label["text"] = "%s %s %s" % (
+        self._out_label['text'] = '%s %s %s' % (
             self._left_name,
             self._operator,
             self._right_name,
@@ -936,11 +937,11 @@ class ChartComparer(object):
     def _clear_out_chart(self):
         self._out_chart = self._emptychart
         self._out_matrix.set_chart(self._out_chart)
-        self._op_label["text"] = " "
+        self._op_label['text'] = ' '
         self._out_matrix.inactivate()
 
     def _detatch_out(self):
-        ChartMatrixView(self._root, self._out_chart, title=self._out_label["text"])
+        ChartMatrixView(self._root, self._out_chart, title=self._out_label['text'])
 
 
 #######################################################################
@@ -1005,9 +1006,9 @@ class ChartView(object):
         Construct a new ``Chart`` display.
         """
         # Process keyword args.
-        draw_tree = kw.get("draw_tree", 0)
-        draw_sentence = kw.get("draw_sentence", 1)
-        self._fontsize = kw.get("fontsize", -12)
+        draw_tree = kw.get('draw_tree', 0)
+        draw_sentence = kw.get('draw_sentence', 1)
+        self._fontsize = kw.get('fontsize', -12)
 
         # The chart!
         self._chart = chart
@@ -1037,7 +1038,7 @@ class ChartView(object):
         # If they didn't provide a main window, then set one up.
         if root is None:
             top = Tk()
-            top.title("Chart View")
+            top.title('Chart View')
 
             def destroy1(e, top=top):
                 top.destroy()
@@ -1045,9 +1046,9 @@ class ChartView(object):
             def destroy2(top=top):
                 top.destroy()
 
-            top.bind("q", destroy1)
-            b = Button(top, text="Done", command=destroy2)
-            b.pack(side="bottom")
+            top.bind('q', destroy1)
+            b = Button(top, text='Done', command=destroy2)
+            b.pack(side='bottom')
             self._root = top
         else:
             self._root = root
@@ -1057,25 +1058,25 @@ class ChartView(object):
 
         # Create the chart canvas.
         (self._chart_sb, self._chart_canvas) = self._sb_canvas(self._root)
-        self._chart_canvas["height"] = 300
-        self._chart_canvas["closeenough"] = 15
+        self._chart_canvas['height'] = 300
+        self._chart_canvas['closeenough'] = 15
 
         # Create the sentence canvas.
         if draw_sentence:
-            cframe = Frame(self._root, relief="sunk", border=2)
-            cframe.pack(fill="both", side="bottom")
+            cframe = Frame(self._root, relief='sunk', border=2)
+            cframe.pack(fill='both', side='bottom')
             self._sentence_canvas = Canvas(cframe, height=50)
-            self._sentence_canvas["background"] = "#e0e0e0"
-            self._sentence_canvas.pack(fill="both")
+            self._sentence_canvas['background'] = '#e0e0e0'
+            self._sentence_canvas.pack(fill='both')
             # self._sentence_canvas['height'] = self._sentence_height
         else:
             self._sentence_canvas = None
 
         # Create the tree canvas.
         if draw_tree:
-            (sb, canvas) = self._sb_canvas(self._root, "n", "x")
+            (sb, canvas) = self._sb_canvas(self._root, 'n', 'x')
             (self._tree_sb, self._tree_canvas) = (sb, canvas)
-            self._tree_canvas["height"] = 200
+            self._tree_canvas['height'] = 200
         else:
             self._tree_canvas = None
 
@@ -1087,45 +1088,45 @@ class ChartView(object):
 
         # Set up the configure callback, which will be called whenever
         # the window is resized.
-        self._chart_canvas.bind("<Configure>", self._configure)
+        self._chart_canvas.bind('<Configure>', self._configure)
 
     def _init_fonts(self, root):
-        self._boldfont = Font(family="helvetica", weight="bold", size=self._fontsize)
-        self._font = Font(family="helvetica", size=self._fontsize)
+        self._boldfont = Font(family='helvetica', weight='bold', size=self._fontsize)
+        self._font = Font(family='helvetica', size=self._fontsize)
         # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
         self._sysfont = Font(font=Button()["font"])
         root.option_add("*Font", self._sysfont)
 
-    def _sb_canvas(self, root, expand="y", fill="both", side="bottom"):
+    def _sb_canvas(self, root, expand='y', fill='both', side='bottom'):
         """
         Helper for __init__: construct a canvas with a scrollbar.
         """
-        cframe = Frame(root, relief="sunk", border=2)
+        cframe = Frame(root, relief='sunk', border=2)
         cframe.pack(fill=fill, expand=expand, side=side)
-        canvas = Canvas(cframe, background="#e0e0e0")
+        canvas = Canvas(cframe, background='#e0e0e0')
 
         # Give the canvas a scrollbar.
-        sb = Scrollbar(cframe, orient="vertical")
-        sb.pack(side="right", fill="y")
-        canvas.pack(side="left", fill=fill, expand="yes")
+        sb = Scrollbar(cframe, orient='vertical')
+        sb.pack(side='right', fill='y')
+        canvas.pack(side='left', fill=fill, expand='yes')
 
         # Connect the scrollbars to the canvas.
-        sb["command"] = canvas.yview
-        canvas["yscrollcommand"] = sb.set
+        sb['command'] = canvas.yview
+        canvas['yscrollcommand'] = sb.set
 
         return (sb, canvas)
 
     def scroll_up(self, *e):
-        self._chart_canvas.yview("scroll", -1, "units")
+        self._chart_canvas.yview('scroll', -1, 'units')
 
     def scroll_down(self, *e):
-        self._chart_canvas.yview("scroll", 1, "units")
+        self._chart_canvas.yview('scroll', 1, 'units')
 
     def page_up(self, *e):
-        self._chart_canvas.yview("scroll", -1, "pages")
+        self._chart_canvas.yview('scroll', -1, 'pages')
 
     def page_down(self, *e):
-        self._chart_canvas.yview("scroll", 1, "pages")
+        self._chart_canvas.yview('scroll', 1, 'pages')
 
     def _grow(self):
         """
@@ -1134,19 +1135,19 @@ class ChartView(object):
         # Grow, if need-be
         N = self._chart.num_leaves()
         width = max(
-            int(self._chart_canvas["width"]), N * self._unitsize + ChartView._MARGIN * 2
+            int(self._chart_canvas['width']), N * self._unitsize + ChartView._MARGIN * 2
         )
 
         # It won't resize without the second (height) line, but I
         # don't understand why not.
         self._chart_canvas.configure(width=width)
-        self._chart_canvas.configure(height=self._chart_canvas["height"])
+        self._chart_canvas.configure(height=self._chart_canvas['height'])
 
         self._unitsize = (width - 2 * ChartView._MARGIN) / N
 
         # Reset the height for the sentence window.
         if self._sentence_canvas is not None:
-            self._sentence_canvas["height"] = self._sentence_height
+            self._sentence_canvas['height'] = self._sentence_height
 
     def set_font_size(self, size):
         self._font.configure(size=-abs(size))
@@ -1229,11 +1230,11 @@ class ChartView(object):
             rhs = " ".join(rhselts)
         else:
             lhs = edge.lhs()
-            rhs = ""
+            rhs = ''
 
         for s in (lhs, rhs):
             tag = c.create_text(
-                0, 0, text=s, font=self._boldfont, anchor="nw", justify="left"
+                0, 0, text=s, font=self._boldfont, anchor='nw', justify='left'
             )
             bbox = c.bbox(tag)
             c.delete(tag)
@@ -1296,9 +1297,9 @@ class ChartView(object):
         # Try to view the new edge..
         y = (level + 1) * self._chart_level_size
         dy = self._text_height + 10
-        self._chart_canvas.yview("moveto", 1.0)
+        self._chart_canvas.yview('moveto', 1.0)
         if self._chart_height != 0:
-            self._chart_canvas.yview("moveto", (y - dy) / self._chart_height)
+            self._chart_canvas.yview('moveto', (y - dy) / self._chart_height)
 
     def _draw_edge(self, edge, lvl):
         """
@@ -1312,7 +1313,7 @@ class ChartView(object):
         if x2 == x1:
             x2 += max(4, self._unitsize / 5)
         y = (lvl + 1) * self._chart_level_size
-        linetag = c.create_line(x1, y, x2, y, arrow="last", width=3)
+        linetag = c.create_line(x1, y, x2, y, arrow='last', width=3)
 
         # Draw a label for the edge.
         if isinstance(edge, TreeEdge):
@@ -1329,13 +1330,13 @@ class ChartView(object):
 
         rhs1 = " ".join(rhs[:pos])
         rhs2 = " ".join(rhs[pos:])
-        rhstag1 = c.create_text(x1 + 3, y, text=rhs1, font=self._font, anchor="nw")
+        rhstag1 = c.create_text(x1 + 3, y, text=rhs1, font=self._font, anchor='nw')
         dotx = c.bbox(rhstag1)[2] + 6
         doty = (c.bbox(rhstag1)[1] + c.bbox(rhstag1)[3]) / 2
         dottag = c.create_oval(dotx - 2, doty - 2, dotx + 2, doty + 2)
-        rhstag2 = c.create_text(dotx + 6, y, text=rhs2, font=self._font, anchor="nw")
+        rhstag2 = c.create_text(dotx + 6, y, text=rhs2, font=self._font, anchor='nw')
         lhstag = c.create_text(
-            (x1 + x2) / 2, y, text=str(edge.lhs()), anchor="s", font=self._boldfont
+            (x1 + x2) / 2, y, text=str(edge.lhs()), anchor='s', font=self._boldfont
         )
 
         # Keep track of the edge's tags.
@@ -1343,13 +1344,13 @@ class ChartView(object):
 
         # Register a callback for clicking on the edge.
         def cb(event, self=self, edge=edge):
-            self._fire_callbacks("select", edge)
+            self._fire_callbacks('select', edge)
 
-        c.tag_bind(rhstag1, "<Button-1>", cb)
-        c.tag_bind(rhstag2, "<Button-1>", cb)
-        c.tag_bind(linetag, "<Button-1>", cb)
-        c.tag_bind(dottag, "<Button-1>", cb)
-        c.tag_bind(lhstag, "<Button-1>", cb)
+        c.tag_bind(rhstag1, '<Button-1>', cb)
+        c.tag_bind(rhstag2, '<Button-1>', cb)
+        c.tag_bind(linetag, '<Button-1>', cb)
+        c.tag_bind(dottag, '<Button-1>', cb)
+        c.tag_bind(lhstag, '<Button-1>', cb)
 
         self._color_edge(edge)
 
@@ -1378,13 +1379,13 @@ class ChartView(object):
             if edge in self._marks:
                 self._color_edge(self._marks[edge])
             if edge.is_complete() and edge.span() == (0, N):
-                self._color_edge(edge, "#084", "#042")
+                self._color_edge(edge, '#084', '#042')
             elif isinstance(edge, LeafEdge):
-                self._color_edge(edge, "#48c", "#246")
+                self._color_edge(edge, '#48c', '#246')
             else:
-                self._color_edge(edge, "#00f", "#008")
+                self._color_edge(edge, '#00f', '#008')
 
-    def mark_edge(self, edge, mark="#0df"):
+    def mark_edge(self, edge, mark='#0df'):
         """
         Mark an edge
         """
@@ -1404,7 +1405,7 @@ class ChartView(object):
             del self._marks[edge]
             self._color_edge(edge)
 
-    def markonly_edge(self, edge, mark="#0df"):
+    def markonly_edge(self, edge, mark='#0df'):
         self.unmark_edge()
         self.mark_edge(edge, mark)
 
@@ -1421,7 +1422,7 @@ class ChartView(object):
         # Check against all tokens
         for leaf in self._chart.leaves():
             tag = c.create_text(
-                0, 0, text=repr(leaf), font=self._font, anchor="nw", justify="left"
+                0, 0, text=repr(leaf), font=self._font, anchor='nw', justify='left'
             )
             bbox = c.bbox(tag)
             c.delete(tag)
@@ -1460,11 +1461,11 @@ class ChartView(object):
 
         levels = len(self._edgelevels)
         self._chart_height = (levels + 2) * self._chart_level_size
-        c["scrollregion"] = (0, 0, width, self._chart_height)
+        c['scrollregion'] = (0, 0, width, self._chart_height)
 
         # Reset the tree scroll region
         if self._tree_canvas:
-            self._tree_canvas["scrollregion"] = (0, 0, width, self._tree_height)
+            self._tree_canvas['scrollregion'] = (0, 0, width, self._tree_height)
 
     def _draw_loclines(self):
         """
@@ -1488,7 +1489,7 @@ class ChartView(object):
                 c2.tag_lower(t2)
             t3 = c3.create_line(x, 0, x, BOTTOM)
             c3.tag_lower(t3)
-            t4 = c3.create_text(x + 2, 0, text=repr(i), anchor="nw", font=self._font)
+            t4 = c3.create_text(x + 2, 0, text=repr(i), anchor='nw', font=self._font)
             c3.tag_lower(t4)
             # if i % 4 == 0:
             #    if c1: c1.itemconfig(t1, width=2, fill='gray60')
@@ -1496,16 +1497,16 @@ class ChartView(object):
             #    c3.itemconfig(t3, width=2, fill='gray60')
             if i % 2 == 0:
                 if c1:
-                    c1.itemconfig(t1, fill="gray60")
+                    c1.itemconfig(t1, fill='gray60')
                 if c2:
-                    c2.itemconfig(t2, fill="gray60")
-                c3.itemconfig(t3, fill="gray60")
+                    c2.itemconfig(t2, fill='gray60')
+                c3.itemconfig(t3, fill='gray60')
             else:
                 if c1:
-                    c1.itemconfig(t1, fill="gray80")
+                    c1.itemconfig(t1, fill='gray80')
                 if c2:
-                    c2.itemconfig(t2, fill="gray80")
-                c3.itemconfig(t3, fill="gray80")
+                    c2.itemconfig(t2, fill='gray80')
+                c3.itemconfig(t3, fill='gray80')
 
     def _draw_sentence(self):
         """Draw the sentence string."""
@@ -1520,7 +1521,7 @@ class ChartView(object):
             x2 = x1 + self._unitsize
             x = (x1 + x2) / 2
             tag = c.create_text(
-                x, y, text=repr(leaf), font=self._font, anchor="n", justify="left"
+                x, y, text=repr(leaf), font=self._font, anchor='n', justify='left'
             )
             bbox = c.bbox(tag)
             rt = c.create_rectangle(
@@ -1528,8 +1529,8 @@ class ChartView(object):
                 bbox[1] - (ChartView._LEAF_SPACING / 2),
                 x2 - 2,
                 bbox[3] + (ChartView._LEAF_SPACING / 2),
-                fill="#f0f0f0",
-                outline="#f0f0f0",
+                fill='#f0f0f0',
+                outline='#f0f0f0',
             )
             c.tag_lower(rt)
 
@@ -1570,7 +1571,7 @@ class ChartView(object):
         # Update the scroll region.
         w = self._chart.num_leaves() * self._unitsize + 2 * ChartView._MARGIN
         h = tree.height() * (ChartView._TREE_LEVEL_SIZE + self._text_height)
-        self._tree_canvas["scrollregion"] = (0, 0, w, h)
+        self._tree_canvas['scrollregion'] = (0, 0, w, h)
 
     def cycle_tree(self):
         self._treetoks_index = (self._treetoks_index + 1) % len(self._treetoks)
@@ -1581,11 +1582,11 @@ class ChartView(object):
             return
 
         # Draw the label.
-        label = "%d Trees" % len(self._treetoks)
+        label = '%d Trees' % len(self._treetoks)
         c = self._tree_canvas
         margin = ChartView._MARGIN
         right = self._chart.num_leaves() * self._unitsize + margin - 2
-        tag = c.create_text(right, 2, anchor="ne", text=label, font=self._boldfont)
+        tag = c.create_text(right, 2, anchor='ne', text=label, font=self._boldfont)
         self._tree_tags.append(tag)
         _, _, _, y = c.bbox(tag)
 
@@ -1593,11 +1594,11 @@ class ChartView(object):
         for i in range(len(self._treetoks)):
             x = right - 20 * (len(self._treetoks) - i - 1)
             if i == self._treetoks_index:
-                fill = "#084"
+                fill = '#084'
             else:
-                fill = "#fff"
+                fill = '#fff'
             tag = c.create_polygon(
-                x, y + 10, x - 5, y, x - 10, y + 10, fill=fill, outline="black"
+                x, y + 10, x - 5, y, x - 10, y + 10, fill=fill, outline='black'
             )
             self._tree_tags.append(tag)
 
@@ -1607,7 +1608,7 @@ class ChartView(object):
                 self._treetoks_index = i
                 self.draw_tree()
 
-            c.tag_bind(tag, "<Button-1>", cb)
+            c.tag_bind(tag, '<Button-1>', cb)
 
     def _draw_treetok(self, treetok, index, depth=0):
         """
@@ -1641,10 +1642,10 @@ class ChartView(object):
         tag = c.create_text(
             nodex,
             nodey,
-            anchor="n",
-            justify="center",
+            anchor='n',
+            justify='center',
             text=str(treetok.label()),
-            fill="#042",
+            fill='#042',
             font=self._boldfont,
         )
         self._tree_tags.append(tag)
@@ -1660,7 +1661,7 @@ class ChartView(object):
                     childx,
                     childy,
                     width=2,
-                    fill="#084",
+                    fill='#084',
                 )
                 self._tree_tags.append(tag)
             if isinstance(child, Tree) and not child:
@@ -1671,8 +1672,8 @@ class ChartView(object):
                     childx,
                     childy,
                     width=2,
-                    fill="#048",
-                    dash="2 3",
+                    fill='#048',
+                    dash='2 3',
                 )
                 self._tree_tags.append(tag)
             if not isinstance(child, Tree):
@@ -1683,7 +1684,7 @@ class ChartView(object):
                     childx,
                     10000,
                     width=2,
-                    fill="#084",
+                    fill='#084',
                 )
                 self._tree_tags.append(tag)
 
@@ -1694,14 +1695,14 @@ class ChartView(object):
         Draw everything (from scratch).
         """
         if self._tree_canvas:
-            self._tree_canvas.delete("all")
+            self._tree_canvas.delete('all')
             self.draw_tree()
 
         if self._sentence_canvas:
-            self._sentence_canvas.delete("all")
+            self._sentence_canvas.delete('all')
             self._draw_sentence()
 
-        self._chart_canvas.delete("all")
+        self._chart_canvas.delete('all')
         self._edgetags = {}
 
         # Redraw any edges we erased.
@@ -1785,7 +1786,7 @@ class FundamentalEdgeRule(EdgeRule, SingleEdgeFundamentalRule):
 
 
 class ChartParserApp(object):
-    def __init__(self, grammar, tokens, title="Chart Parser Application"):
+    def __init__(self, grammar, tokens, title='Chart Parser Application'):
         # Initialize the parser
         self._init_parser(grammar, tokens)
 
@@ -1794,15 +1795,15 @@ class ChartParserApp(object):
             # Create the root window.
             self._root = Tk()
             self._root.title(title)
-            self._root.bind("<Control-q>", self.destroy)
+            self._root.bind('<Control-q>', self.destroy)
 
             # Set up some frames.
             frame3 = Frame(self._root)
             frame2 = Frame(self._root)
             frame1 = Frame(self._root)
-            frame3.pack(side="bottom", fill="none")
-            frame2.pack(side="bottom", fill="x")
-            frame1.pack(side="bottom", fill="both", expand=1)
+            frame3.pack(side='bottom', fill='none')
+            frame2.pack(side='bottom', fill='x')
+            frame1.pack(side='bottom', fill='both', expand=1)
 
             self._init_fonts(self._root)
             self._init_animation()
@@ -1818,7 +1819,7 @@ class ChartParserApp(object):
             self._init_bindings()
 
         except:
-            print("Error creating Tree View")
+            print('Error creating Tree View')
             self.destroy()
             raise
 
@@ -1870,10 +1871,10 @@ class ChartParserApp(object):
 
         # TWhat's our font size (default=same as sysfont)
         self._size = IntVar(root)
-        self._size.set(self._sysfont.cget("size"))
+        self._size.set(self._sysfont.cget('size'))
 
-        self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
-        self._font = Font(family="helvetica", size=self._size.get())
+        self._boldfont = Font(family='helvetica', weight='bold', size=self._size.get())
+        self._font = Font(family='helvetica', size=self._size.get())
 
     def _init_animation(self):
         # Are we stepping? (default=yes)
@@ -1889,214 +1890,214 @@ class ChartParserApp(object):
 
     def _init_chartview(self, parent):
         self._cv = ChartView(self._chart, parent, draw_tree=1, draw_sentence=1)
-        self._cv.add_callback("select", self._click_cv_edge)
+        self._cv.add_callback('select', self._click_cv_edge)
 
     def _init_rulelabel(self, parent):
-        ruletxt = "Last edge generated by:"
+        ruletxt = 'Last edge generated by:'
 
         self._rulelabel1 = Label(parent, text=ruletxt, font=self._boldfont)
         self._rulelabel2 = Label(
-            parent, width=40, relief="groove", anchor="w", font=self._boldfont
+            parent, width=40, relief='groove', anchor='w', font=self._boldfont
         )
-        self._rulelabel1.pack(side="left")
-        self._rulelabel2.pack(side="left")
-        step = Checkbutton(parent, variable=self._step, text="Step")
-        step.pack(side="right")
+        self._rulelabel1.pack(side='left')
+        self._rulelabel2.pack(side='left')
+        step = Checkbutton(parent, variable=self._step, text='Step')
+        step.pack(side='right')
 
     def _init_buttons(self, parent):
         frame1 = Frame(parent)
         frame2 = Frame(parent)
-        frame1.pack(side="bottom", fill="x")
-        frame2.pack(side="top", fill="none")
+        frame1.pack(side='bottom', fill='x')
+        frame2.pack(side='top', fill='none')
 
         Button(
             frame1,
-            text="Reset\nParser",
-            background="#90c0d0",
-            foreground="black",
+            text='Reset\nParser',
+            background='#90c0d0',
+            foreground='black',
             command=self.reset,
-        ).pack(side="right")
+        ).pack(side='right')
         # Button(frame1, text='Pause',
         #               background='#90c0d0', foreground='black',
         #               command=self.pause).pack(side='left')
 
         Button(
             frame1,
-            text="Top Down\nStrategy",
-            background="#90c0d0",
-            foreground="black",
+            text='Top Down\nStrategy',
+            background='#90c0d0',
+            foreground='black',
             command=self.top_down_strategy,
-        ).pack(side="left")
+        ).pack(side='left')
         Button(
             frame1,
-            text="Bottom Up\nStrategy",
-            background="#90c0d0",
-            foreground="black",
+            text='Bottom Up\nStrategy',
+            background='#90c0d0',
+            foreground='black',
             command=self.bottom_up_strategy,
-        ).pack(side="left")
+        ).pack(side='left')
         Button(
             frame1,
-            text="Bottom Up\nLeft-Corner Strategy",
-            background="#90c0d0",
-            foreground="black",
+            text='Bottom Up\nLeft-Corner Strategy',
+            background='#90c0d0',
+            foreground='black',
             command=self.bottom_up_leftcorner_strategy,
-        ).pack(side="left")
+        ).pack(side='left')
 
         Button(
             frame2,
-            text="Top Down Init\nRule",
-            background="#90f090",
-            foreground="black",
+            text='Top Down Init\nRule',
+            background='#90f090',
+            foreground='black',
             command=self.top_down_init,
-        ).pack(side="left")
+        ).pack(side='left')
         Button(
             frame2,
-            text="Top Down Predict\nRule",
-            background="#90f090",
-            foreground="black",
+            text='Top Down Predict\nRule',
+            background='#90f090',
+            foreground='black',
             command=self.top_down_predict,
-        ).pack(side="left")
-        Frame(frame2, width=20).pack(side="left")
+        ).pack(side='left')
+        Frame(frame2, width=20).pack(side='left')
 
         Button(
             frame2,
-            text="Bottom Up Predict\nRule",
-            background="#90f090",
-            foreground="black",
+            text='Bottom Up Predict\nRule',
+            background='#90f090',
+            foreground='black',
             command=self.bottom_up,
-        ).pack(side="left")
-        Frame(frame2, width=20).pack(side="left")
+        ).pack(side='left')
+        Frame(frame2, width=20).pack(side='left')
 
         Button(
             frame2,
-            text="Bottom Up Left-Corner\nPredict Rule",
-            background="#90f090",
-            foreground="black",
+            text='Bottom Up Left-Corner\nPredict Rule',
+            background='#90f090',
+            foreground='black',
             command=self.bottom_up_leftcorner,
-        ).pack(side="left")
-        Frame(frame2, width=20).pack(side="left")
+        ).pack(side='left')
+        Frame(frame2, width=20).pack(side='left')
 
         Button(
             frame2,
-            text="Fundamental\nRule",
-            background="#90f090",
-            foreground="black",
+            text='Fundamental\nRule',
+            background='#90f090',
+            foreground='black',
             command=self.fundamental,
-        ).pack(side="left")
+        ).pack(side='left')
 
     def _init_bindings(self):
-        self._root.bind("<Up>", self._cv.scroll_up)
-        self._root.bind("<Down>", self._cv.scroll_down)
-        self._root.bind("<Prior>", self._cv.page_up)
-        self._root.bind("<Next>", self._cv.page_down)
-        self._root.bind("<Control-q>", self.destroy)
-        self._root.bind("<Control-x>", self.destroy)
-        self._root.bind("<F1>", self.help)
-
-        self._root.bind("<Control-s>", self.save_chart)
-        self._root.bind("<Control-o>", self.load_chart)
-        self._root.bind("<Control-r>", self.reset)
-
-        self._root.bind("t", self.top_down_strategy)
-        self._root.bind("b", self.bottom_up_strategy)
-        self._root.bind("c", self.bottom_up_leftcorner_strategy)
-        self._root.bind("<space>", self._stop_animation)
-
-        self._root.bind("<Control-g>", self.edit_grammar)
-        self._root.bind("<Control-t>", self.edit_sentence)
+        self._root.bind('<Up>', self._cv.scroll_up)
+        self._root.bind('<Down>', self._cv.scroll_down)
+        self._root.bind('<Prior>', self._cv.page_up)
+        self._root.bind('<Next>', self._cv.page_down)
+        self._root.bind('<Control-q>', self.destroy)
+        self._root.bind('<Control-x>', self.destroy)
+        self._root.bind('<F1>', self.help)
+
+        self._root.bind('<Control-s>', self.save_chart)
+        self._root.bind('<Control-o>', self.load_chart)
+        self._root.bind('<Control-r>', self.reset)
+
+        self._root.bind('t', self.top_down_strategy)
+        self._root.bind('b', self.bottom_up_strategy)
+        self._root.bind('c', self.bottom_up_leftcorner_strategy)
+        self._root.bind('<space>', self._stop_animation)
+
+        self._root.bind('<Control-g>', self.edit_grammar)
+        self._root.bind('<Control-t>', self.edit_sentence)
 
         # Animation speed control
-        self._root.bind("-", lambda e, a=self._animate: a.set(1))
-        self._root.bind("=", lambda e, a=self._animate: a.set(2))
-        self._root.bind("+", lambda e, a=self._animate: a.set(3))
+        self._root.bind('-', lambda e, a=self._animate: a.set(1))
+        self._root.bind('=', lambda e, a=self._animate: a.set(2))
+        self._root.bind('+', lambda e, a=self._animate: a.set(3))
 
         # Step control
-        self._root.bind("s", lambda e, s=self._step: s.set(not s.get()))
+        self._root.bind('s', lambda e, s=self._step: s.set(not s.get()))
 
     def _init_menubar(self):
         menubar = Menu(self._root)
 
         filemenu = Menu(menubar, tearoff=0)
         filemenu.add_command(
-            label="Save Chart",
+            label='Save Chart',
             underline=0,
             command=self.save_chart,
-            accelerator="Ctrl-s",
+            accelerator='Ctrl-s',
         )
         filemenu.add_command(
-            label="Load Chart",
+            label='Load Chart',
             underline=0,
             command=self.load_chart,
-            accelerator="Ctrl-o",
+            accelerator='Ctrl-o',
         )
         filemenu.add_command(
-            label="Reset Chart", underline=0, command=self.reset, accelerator="Ctrl-r"
+            label='Reset Chart', underline=0, command=self.reset, accelerator='Ctrl-r'
         )
         filemenu.add_separator()
-        filemenu.add_command(label="Save Grammar", command=self.save_grammar)
-        filemenu.add_command(label="Load Grammar", command=self.load_grammar)
+        filemenu.add_command(label='Save Grammar', command=self.save_grammar)
+        filemenu.add_command(label='Load Grammar', command=self.load_grammar)
         filemenu.add_separator()
         filemenu.add_command(
-            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+            label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
         )
-        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
 
         editmenu = Menu(menubar, tearoff=0)
         editmenu.add_command(
-            label="Edit Grammar",
+            label='Edit Grammar',
             underline=5,
             command=self.edit_grammar,
-            accelerator="Ctrl-g",
+            accelerator='Ctrl-g',
         )
         editmenu.add_command(
-            label="Edit Text",
+            label='Edit Text',
             underline=5,
             command=self.edit_sentence,
-            accelerator="Ctrl-t",
+            accelerator='Ctrl-t',
         )
-        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+        menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
 
         viewmenu = Menu(menubar, tearoff=0)
         viewmenu.add_command(
-            label="Chart Matrix", underline=6, command=self.view_matrix
+            label='Chart Matrix', underline=6, command=self.view_matrix
         )
-        viewmenu.add_command(label="Results", underline=0, command=self.view_results)
-        menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+        viewmenu.add_command(label='Results', underline=0, command=self.view_results)
+        menubar.add_cascade(label='View', underline=0, menu=viewmenu)
 
         rulemenu = Menu(menubar, tearoff=0)
         rulemenu.add_command(
-            label="Top Down Strategy",
+            label='Top Down Strategy',
             underline=0,
             command=self.top_down_strategy,
-            accelerator="t",
+            accelerator='t',
         )
         rulemenu.add_command(
-            label="Bottom Up Strategy",
+            label='Bottom Up Strategy',
             underline=0,
             command=self.bottom_up_strategy,
-            accelerator="b",
+            accelerator='b',
         )
         rulemenu.add_command(
-            label="Bottom Up Left-Corner Strategy",
+            label='Bottom Up Left-Corner Strategy',
             underline=0,
             command=self.bottom_up_leftcorner_strategy,
-            accelerator="c",
+            accelerator='c',
         )
         rulemenu.add_separator()
-        rulemenu.add_command(label="Bottom Up Rule", command=self.bottom_up)
+        rulemenu.add_command(label='Bottom Up Rule', command=self.bottom_up)
         rulemenu.add_command(
-            label="Bottom Up Left-Corner Rule", command=self.bottom_up_leftcorner
+            label='Bottom Up Left-Corner Rule', command=self.bottom_up_leftcorner
         )
-        rulemenu.add_command(label="Top Down Init Rule", command=self.top_down_init)
+        rulemenu.add_command(label='Top Down Init Rule', command=self.top_down_init)
         rulemenu.add_command(
-            label="Top Down Predict Rule", command=self.top_down_predict
+            label='Top Down Predict Rule', command=self.top_down_predict
         )
-        rulemenu.add_command(label="Fundamental Rule", command=self.fundamental)
-        menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)
+        rulemenu.add_command(label='Fundamental Rule', command=self.fundamental)
+        menubar.add_cascade(label='Apply', underline=0, menu=rulemenu)
 
         animatemenu = Menu(menubar, tearoff=0)
         animatemenu.add_checkbutton(
-            label="Step", underline=0, variable=self._step, accelerator="s"
+            label="Step", underline=0, variable=self._step, accelerator='s'
         )
         animatemenu.add_separator()
         animatemenu.add_radiobutton(
@@ -2107,68 +2108,68 @@ class ChartParserApp(object):
             underline=0,
             variable=self._animate,
             value=1,
-            accelerator="-",
+            accelerator='-',
         )
         animatemenu.add_radiobutton(
             label="Normal Animation",
             underline=0,
             variable=self._animate,
             value=2,
-            accelerator="=",
+            accelerator='=',
         )
         animatemenu.add_radiobutton(
             label="Fast Animation",
             underline=0,
             variable=self._animate,
             value=3,
-            accelerator="+",
+            accelerator='+',
         )
         menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
 
         zoommenu = Menu(menubar, tearoff=0)
         zoommenu.add_radiobutton(
-            label="Tiny",
+            label='Tiny',
             variable=self._size,
             underline=0,
             value=10,
             command=self.resize,
         )
         zoommenu.add_radiobutton(
-            label="Small",
+            label='Small',
             variable=self._size,
             underline=0,
             value=12,
             command=self.resize,
         )
         zoommenu.add_radiobutton(
-            label="Medium",
+            label='Medium',
             variable=self._size,
             underline=0,
             value=14,
             command=self.resize,
         )
         zoommenu.add_radiobutton(
-            label="Large",
+            label='Large',
             variable=self._size,
             underline=0,
             value=18,
             command=self.resize,
         )
         zoommenu.add_radiobutton(
-            label="Huge",
+            label='Huge',
             variable=self._size,
             underline=0,
             value=24,
             command=self.resize,
         )
-        menubar.add_cascade(label="Zoom", underline=0, menu=zoommenu)
+        menubar.add_cascade(label='Zoom', underline=0, menu=zoommenu)
 
         helpmenu = Menu(menubar, tearoff=0)
-        helpmenu.add_command(label="About", underline=0, command=self.about)
+        helpmenu.add_command(label='About', underline=0, command=self.about)
         helpmenu.add_command(
-            label="Instructions", underline=0, command=self.help, accelerator="F1"
+            label='Instructions', underline=0, command=self.help, accelerator='F1'
         )
-        menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+        menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
 
         self._root.config(menu=menubar)
 
@@ -2193,7 +2194,7 @@ class ChartParserApp(object):
     def _select_edge(self, edge):
         self._selection = edge
         # Update the chart view.
-        self._cv.markonly_edge(edge, "#f00")
+        self._cv.markonly_edge(edge, '#f00')
         self._cv.draw_tree(edge)
         # Update the matrix view.
         if self._matrix:
@@ -2215,7 +2216,7 @@ class ChartParserApp(object):
         # Update the chart view.
         self._cv.update()
         self._cv.draw_tree(edge)
-        self._cv.markonly_edge(edge, "#0df")
+        self._cv.markonly_edge(edge, '#0df')
         self._cv.view_edge(edge)
         # Update the matrix view.
         if self._matrix:
@@ -2238,43 +2239,43 @@ class ChartParserApp(object):
         try:
             ShowText(
                 self._root,
-                "Help: Chart Parser Application",
-                (__doc__ or "").strip(),
+                'Help: Chart Parser Application',
+                (__doc__ or '').strip(),
                 width=75,
-                font="fixed",
+                font='fixed',
             )
         except:
             ShowText(
                 self._root,
-                "Help: Chart Parser Application",
-                (__doc__ or "").strip(),
+                'Help: Chart Parser Application',
+                (__doc__ or '').strip(),
                 width=75,
             )
 
     def about(self, *e):
         ABOUT = "NLTK Chart Parser Application\n" + "Written by Edward Loper"
-        showinfo("About: Chart Parser Application", ABOUT)
+        showinfo('About: Chart Parser Application', ABOUT)
 
     # ////////////////////////////////////////////////////////////
     # File Menu
     # ////////////////////////////////////////////////////////////
 
-    CHART_FILE_TYPES = [("Pickle file", ".pickle"), ("All files", "*")]
+    CHART_FILE_TYPES = [('Pickle file', '.pickle'), ('All files', '*')]
     GRAMMAR_FILE_TYPES = [
-        ("Plaintext grammar file", ".cfg"),
-        ("Pickle file", ".pickle"),
-        ("All files", "*"),
+        ('Plaintext grammar file', '.cfg'),
+        ('Pickle file', '.pickle'),
+        ('All files', '*'),
     ]
 
     def load_chart(self, *args):
         "Load a chart from a pickle file"
         filename = askopenfilename(
-            filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
+            filetypes=self.CHART_FILE_TYPES, defaultextension='.pickle'
         )
         if not filename:
             return
         try:
-            with open(filename, "rb") as infile:
+            with open(filename, 'rb') as infile:
                 chart = pickle.load(infile)
             self._chart = chart
             self._cv.update(chart)
@@ -2287,61 +2288,61 @@ class ChartParserApp(object):
             self._cp.set_chart(chart)
         except Exception as e:
             raise
-            showerror("Error Loading Chart", "Unable to open file: %r" % filename)
+            showerror('Error Loading Chart', 'Unable to open file: %r' % filename)
 
     def save_chart(self, *args):
         "Save a chart to a pickle file"
         filename = asksaveasfilename(
-            filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
+            filetypes=self.CHART_FILE_TYPES, defaultextension='.pickle'
         )
         if not filename:
             return
         try:
-            with open(filename, "wb") as outfile:
+            with open(filename, 'wb') as outfile:
                 pickle.dump(self._chart, outfile)
         except Exception as e:
             raise
-            showerror("Error Saving Chart", "Unable to open file: %r" % filename)
+            showerror('Error Saving Chart', 'Unable to open file: %r' % filename)
 
     def load_grammar(self, *args):
         "Load a grammar from a pickle file"
         filename = askopenfilename(
-            filetypes=self.GRAMMAR_FILE_TYPES, defaultextension=".cfg"
+            filetypes=self.GRAMMAR_FILE_TYPES, defaultextension='.cfg'
         )
         if not filename:
             return
         try:
-            if filename.endswith(".pickle"):
-                with open(filename, "rb") as infile:
+            if filename.endswith('.pickle'):
+                with open(filename, 'rb') as infile:
                     grammar = pickle.load(infile)
             else:
-                with open(filename, "r") as infile:
+                with open(filename, 'r') as infile:
                     grammar = CFG.fromstring(infile.read())
             self.set_grammar(grammar)
         except Exception as e:
-            showerror("Error Loading Grammar", "Unable to open file: %r" % filename)
+            showerror('Error Loading Grammar', 'Unable to open file: %r' % filename)
 
     def save_grammar(self, *args):
         filename = asksaveasfilename(
-            filetypes=self.GRAMMAR_FILE_TYPES, defaultextension=".cfg"
+            filetypes=self.GRAMMAR_FILE_TYPES, defaultextension='.cfg'
         )
         if not filename:
             return
         try:
-            if filename.endswith(".pickle"):
-                with open(filename, "wb") as outfile:
+            if filename.endswith('.pickle'):
+                with open(filename, 'wb') as outfile:
                     pickle.dump((self._chart, self._tokens), outfile)
             else:
-                with open(filename, "w") as outfile:
+                with open(filename, 'w') as outfile:
                     prods = self._grammar.productions()
                     start = [p for p in prods if p.lhs() == self._grammar.start()]
                     rest = [p for p in prods if p.lhs() != self._grammar.start()]
                     for prod in start:
-                        outfile.write("%s\n" % prod)
+                        outfile.write('%s\n' % prod)
                     for prod in rest:
-                        outfile.write("%s\n" % prod)
+                        outfile.write('%s\n' % prod)
         except Exception as e:
-            showerror("Error Saving Grammar", "Unable to open file: %r" % filename)
+            showerror('Error Saving Grammar', 'Unable to open file: %r' % filename)
 
     def reset(self, *args):
         self._animating = 0
@@ -2369,8 +2370,8 @@ class ChartParserApp(object):
 
     def edit_sentence(self, *e):
         sentence = " ".join(self._tokens)
-        title = "Edit Text"
-        instr = "Enter a new sentence to parse."
+        title = 'Edit Text'
+        instr = 'Enter a new sentence to parse.'
         EntryDialog(self._root, sentence, instr, self.set_sentence, title)
 
     def set_sentence(self, sentence):
@@ -2385,7 +2386,7 @@ class ChartParserApp(object):
         if self._matrix is not None:
             self._matrix.destroy()
         self._matrix = ChartMatrixView(self._root, self._chart)
-        self._matrix.add_callback("select", self._select_matrix_edge)
+        self._matrix.add_callback('select', self._select_matrix_edge)
 
     def view_results(self, *e):
         if self._results is not None:
@@ -2478,10 +2479,10 @@ class ChartParserApp(object):
 
     def _display_rule(self, rule):
         if rule is None:
-            self._rulelabel2["text"] = ""
+            self._rulelabel2['text'] = ''
         else:
             name = str(rule)
-            self._rulelabel2["text"] = name
+            self._rulelabel2['text'] = name
             size = self._cv.get_font_size()
 
     # ////////////////////////////////////////////////////////////
@@ -2543,20 +2544,20 @@ def app():
     """
     )
 
-    sent = "John ate the cake on the table with a fork"
-    sent = "John ate the cake on the table"
+    sent = 'John ate the cake on the table with a fork'
+    sent = 'John ate the cake on the table'
     tokens = list(sent.split())
 
-    print("grammar= (")
+    print('grammar= (')
     for rule in grammar.productions():
-        print(("    ", repr(rule) + ","))
-    print(")")
-    print(("tokens = %r" % tokens))
+        print(('    ', repr(rule) + ','))
+    print(')')
+    print(('tokens = %r' % tokens))
     print('Calling "ChartParserApp(grammar, tokens)"...')
     ChartParserApp(grammar, tokens).mainloop()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     app()
 
     # Chart comparer:
@@ -2572,4 +2573,4 @@ if __name__ == "__main__":
     # p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
     # p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
 
-__all__ = ["app"]
+__all__ = ['app']
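
A minimal standalone sketch (not part of this commit) of the compatibility pattern the
conversion applies throughout these GUI apps: tkinter is reached through six.moves so the
same source runs on both Python 2 and Python 3, and string literals revert to the
single-quote style of NLTK 3.4. The window title, labels, font parameters, and callbacks
below are illustrative only and assume the six package is installed.

    # Hypothetical example; not taken from the diff above.
    from __future__ import division  # harmless on Python 3, restores py3 division on Python 2
    from six.moves.tkinter import Tk, Menu, Label
    from six.moves.tkinter_tkfiledialog import askopenfilename
    from six.moves.tkinter_font import Font

    root = Tk()
    root.title('six.moves tkinter demo')
    # The apps in this commit build their menus with the same add_command/add_cascade calls.
    menubar = Menu(root)
    filemenu = Menu(menubar, tearoff=0)
    filemenu.add_command(label='Open...', command=lambda: askopenfilename())
    filemenu.add_command(label='Exit', underline=1, command=root.destroy)
    menubar.add_cascade(label='File', underline=0, menu=filemenu)
    root.config(menu=menubar)
    label_font = Font(family='helvetica', size=-16)  # negative size = pixels, as in the apps
    Label(root, text='Converted to the NLTK 3.4 / six.moves style', font=label_font).pack()
    root.mainloop()
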
index 699b7d7..2aeca10 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Regexp Chunk Parser Application
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -15,12 +15,13 @@ parser ``nltk.chunk.RegexpChunkParser``.
 # configuration parameters to select what's being chunked (eg VP vs NP)
 # and what part of the data is being used as the development set.
 
+from __future__ import division
 import time
 import textwrap
 import re
 import random
 
-from tkinter import (
+from six.moves.tkinter import (
     Button,
     Canvas,
     Checkbutton,
@@ -32,8 +33,8 @@ from tkinter import (
     Text,
     Tk,
 )
-from tkinter.filedialog import askopenfilename, asksaveasfilename
-from tkinter.font import Font
+from six.moves.tkinter_tkfiledialog import askopenfilename, asksaveasfilename
+from six.moves.tkinter_font import Font
 
 from nltk.tree import Tree
 from nltk.util import in_idle
@@ -59,51 +60,51 @@ class RegexpChunkApp(object):
     #: which is used in the help text.  (This should probably live with
     #: the conll and/or treebank corpus instead.)
     TAGSET = {
-        "CC": "Coordinating conjunction",
-        "PRP$": "Possessive pronoun",
-        "CD": "Cardinal number",
-        "RB": "Adverb",
-        "DT": "Determiner",
-        "RBR": "Adverb, comparative",
-        "EX": "Existential there",
-        "RBS": "Adverb, superlative",
-        "FW": "Foreign word",
-        "RP": "Particle",
-        "JJ": "Adjective",
-        "TO": "to",
-        "JJR": "Adjective, comparative",
-        "UH": "Interjection",
-        "JJS": "Adjective, superlative",
-        "VB": "Verb, base form",
-        "LS": "List item marker",
-        "VBD": "Verb, past tense",
-        "MD": "Modal",
-        "NNS": "Noun, plural",
-        "NN": "Noun, singular or masps",
-        "VBN": "Verb, past participle",
-        "VBZ": "Verb,3rd ps. sing. present",
-        "NNP": "Proper noun, singular",
-        "NNPS": "Proper noun plural",
-        "WDT": "wh-determiner",
-        "PDT": "Predeterminer",
-        "WP": "wh-pronoun",
-        "POS": "Possessive ending",
-        "WP$": "Possessive wh-pronoun",
-        "PRP": "Personal pronoun",
-        "WRB": "wh-adverb",
-        "(": "open parenthesis",
-        ")": "close parenthesis",
-        "``": "open quote",
-        ",": "comma",
-        "''": "close quote",
-        ".": "period",
-        "#": "pound sign (currency marker)",
-        "$": "dollar sign (currency marker)",
-        "IN": "Preposition/subord. conjunction",
-        "SYM": "Symbol (mathematical or scientific)",
-        "VBG": "Verb, gerund/present participle",
-        "VBP": "Verb, non-3rd ps. sing. present",
-        ":": "colon",
+        'CC': 'Coordinating conjunction',
+        'PRP$': 'Possessive pronoun',
+        'CD': 'Cardinal number',
+        'RB': 'Adverb',
+        'DT': 'Determiner',
+        'RBR': 'Adverb, comparative',
+        'EX': 'Existential there',
+        'RBS': 'Adverb, superlative',
+        'FW': 'Foreign word',
+        'RP': 'Particle',
+        'JJ': 'Adjective',
+        'TO': 'to',
+        'JJR': 'Adjective, comparative',
+        'UH': 'Interjection',
+        'JJS': 'Adjective, superlative',
+        'VB': 'Verb, base form',
+        'LS': 'List item marker',
+        'VBD': 'Verb, past tense',
+        'MD': 'Modal',
+        'NNS': 'Noun, plural',
+        'NN': 'Noun, singular or mass',
+        'VBN': 'Verb, past participle',
+        'VBZ': 'Verb, 3rd ps. sing. present',
+        'NNP': 'Proper noun, singular',
+        'NNPS': 'Proper noun, plural',
+        'WDT': 'wh-determiner',
+        'PDT': 'Predeterminer',
+        'WP': 'wh-pronoun',
+        'POS': 'Possessive ending',
+        'WP$': 'Possessive wh-pronoun',
+        'PRP': 'Personal pronoun',
+        'WRB': 'wh-adverb',
+        '(': 'open parenthesis',
+        ')': 'close parenthesis',
+        '``': 'open quote',
+        ',': 'comma',
+        "''": 'close quote',
+        '.': 'period',
+        '#': 'pound sign (currency marker)',
+        '$': 'dollar sign (currency marker)',
+        'IN': 'Preposition/subord. conjunction',
+        'SYM': 'Symbol (mathematical or scientific)',
+        'VBG': 'Verb, gerund/present participle',
+        'VBP': 'Verb, non-3rd ps. sing. present',
+        ':': 'colon',
     }
 
     #: Contents for the help box.  This is a list of tuples, one for
@@ -115,8 +116,8 @@ class RegexpChunkApp(object):
     #:     for a list of tags you can use for colorizing.
     HELP = [
         (
-            "Help",
-            "20",
+            'Help',
+            '20',
             "Welcome to the regular expression chunk-parser grammar editor.  "
             "You can use this editor to develop and test chunk parser grammars "
             "based on NLTK's RegexpChunkParser class.\n\n"
@@ -154,8 +155,8 @@ class RegexpChunkApp(object):
             "the status bar at the bottom of the window.",
         ),
         (
-            "Rules",
-            "10",
+            'Rules',
+            '10',
             "<h1>{...regexp...}</h1>"
             "<indent>\nChunk rule: creates new chunks from words matching "
             "regexp.</indent>\n\n"
@@ -170,8 +171,8 @@ class RegexpChunkApp(object):
             "and regexp2</indent>\n",
         ),
         (
-            "Regexps",
-            "10 60",
+            'Regexps',
+            '10 60',
             # "Regular Expression Syntax Summary:\n\n"
             "<h1>Pattern\t\tMatches...</h1>\n"
             "<hangindent>"
@@ -191,42 +192,42 @@ class RegexpChunkApp(object):
             "</hangindent>"
             "\n<h1>Examples:</h1>\n"
             "<hangindent>"
-            "\t<regexp><NN></regexp>\n"
+            '\t<regexp><NN></regexp>\n'
             '\t\tMatches <match>"cow/NN"</match>\n'
             '\t\tMatches <match>"green/NN"</match>\n'
-            "\t<regexp><VB.*></regexp>\n"
+            '\t<regexp><VB.*></regexp>\n'
             '\t\tMatches <match>"eating/VBG"</match>\n'
             '\t\tMatches <match>"ate/VBD"</match>\n'
-            "\t<regexp><IN><DT><NN></regexp>\n"
+            '\t<regexp><IN><DT><NN></regexp>\n'
             '\t\tMatches <match>"on/IN the/DT car/NN"</match>\n'
-            "\t<regexp><RB>?<VBD></regexp>\n"
+            '\t<regexp><RB>?<VBD></regexp>\n'
             '\t\tMatches <match>"ran/VBD"</match>\n'
             '\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
-            "\t<regexp><\#><CD> # This is a comment...</regexp>\n"
+            '\t<regexp><\#><CD> # This is a comment...</regexp>\n'
             '\t\tMatches <match>"#/# 100/CD"</match>\n'
             "</hangindent>",
         ),
         (
-            "Tags",
-            "10 60",
+            'Tags',
+            '10 60',
             "<h1>Part of Speech Tags:</h1>\n"
-            + "<hangindent>"
-            + "<<TAGSET>>"
-            + "</hangindent>\n",  # this gets auto-substituted w/ self.TAGSET
+            + '<hangindent>'
+            + '<<TAGSET>>'
+            + '</hangindent>\n',  # this gets auto-substituted w/ self.TAGSET
         ),
     ]
 
     HELP_AUTOTAG = [
-        ("red", dict(foreground="#a00")),
-        ("green", dict(foreground="#080")),
-        ("highlight", dict(background="#ddd")),
-        ("underline", dict(underline=True)),
-        ("h1", dict(underline=True)),
-        ("indent", dict(lmargin1=20, lmargin2=20)),
-        ("hangindent", dict(lmargin1=0, lmargin2=60)),
-        ("var", dict(foreground="#88f")),
-        ("regexp", dict(foreground="#ba7")),
-        ("match", dict(foreground="#6a6")),
+        ('red', dict(foreground='#a00')),
+        ('green', dict(foreground='#080')),
+        ('highlight', dict(background='#ddd')),
+        ('underline', dict(underline=True)),
+        ('h1', dict(underline=True)),
+        ('indent', dict(lmargin1=20, lmargin2=20)),
+        ('hangindent', dict(lmargin1=0, lmargin2=60)),
+        ('var', dict(foreground='#88f')),
+        ('regexp', dict(foreground='#ba7')),
+        ('match', dict(foreground='#6a6')),
     ]
 
     ##/////////////////////////////////////////////////////////////////
@@ -255,74 +256,74 @@ class RegexpChunkApp(object):
     _GRAMMARBOX_PARAMS = dict(
         width=40,
         height=12,
-        background="#efe",
-        highlightbackground="#efe",
+        background='#efe',
+        highlightbackground='#efe',
         highlightthickness=1,
-        relief="groove",
+        relief='groove',
         border=2,
-        wrap="word",
+        wrap='word',
     )
     _HELPBOX_PARAMS = dict(
         width=15,
         height=15,
-        background="#efe",
-        highlightbackground="#efe",
-        foreground="#555",
+        background='#efe',
+        highlightbackground='#efe',
+        foreground='#555',
         highlightthickness=1,
-        relief="groove",
+        relief='groove',
         border=2,
-        wrap="word",
+        wrap='word',
     )
     _DEVSETBOX_PARAMS = dict(
         width=70,
         height=10,
-        background="#eef",
-        highlightbackground="#eef",
+        background='#eef',
+        highlightbackground='#eef',
         highlightthickness=1,
-        relief="groove",
+        relief='groove',
         border=2,
-        wrap="word",
+        wrap='word',
         tabs=(30,),
     )
-    _STATUS_PARAMS = dict(background="#9bb", relief="groove", border=2)
-    _FONT_PARAMS = dict(family="helvetica", size=-20)
-    _FRAME_PARAMS = dict(background="#777", padx=2, pady=2, border=3)
+    _STATUS_PARAMS = dict(background='#9bb', relief='groove', border=2)
+    _FONT_PARAMS = dict(family='helvetica', size=-20)
+    _FRAME_PARAMS = dict(background='#777', padx=2, pady=2, border=3)
     _EVALBOX_PARAMS = dict(
-        background="#eef",
-        highlightbackground="#eef",
+        background='#eef',
+        highlightbackground='#eef',
         highlightthickness=1,
-        relief="groove",
+        relief='groove',
         border=2,
         width=300,
         height=280,
     )
     _BUTTON_PARAMS = dict(
-        background="#777", activebackground="#777", highlightbackground="#777"
+        background='#777', activebackground='#777', highlightbackground='#777'
     )
-    _HELPTAB_BG_COLOR = "#aba"
-    _HELPTAB_FG_COLOR = "#efe"
+    _HELPTAB_BG_COLOR = '#aba'
+    _HELPTAB_FG_COLOR = '#efe'
 
-    _HELPTAB_FG_PARAMS = dict(background="#efe")
-    _HELPTAB_BG_PARAMS = dict(background="#aba")
+    _HELPTAB_FG_PARAMS = dict(background='#efe')
+    _HELPTAB_BG_PARAMS = dict(background='#aba')
     _HELPTAB_SPACER = 6
 
     def normalize_grammar(self, grammar):
         # Strip comments
-        grammar = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", grammar)
+        grammar = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', grammar)
         # Normalize whitespace
-        grammar = re.sub(" +", " ", grammar)
-        grammar = re.sub("\n\s+", "\n", grammar)
+        grammar = re.sub(' +', ' ', grammar)
+        grammar = re.sub('\n\s+', '\n', grammar)
         grammar = grammar.strip()
         # [xx] Hack: automatically backslash $!
-        grammar = re.sub(r"([^\\])\$", r"\1\\$", grammar)
+        grammar = re.sub(r'([^\\])\$', r'\1\\$', grammar)
         return grammar
 
     def __init__(
         self,
-        devset_name="conll2000",
+        devset_name='conll2000',
         devset=None,
-        grammar="",
-        chunk_label="NP",
+        grammar='',
+        chunk_label='NP',
         tagset=None,
     ):
         """
@@ -343,12 +344,12 @@ class RegexpChunkApp(object):
 
         # Named development sets:
         if devset is None:
-            if devset_name == "conll2000":
-                devset = conll2000.chunked_sents("train.txt")  # [:100]
-            elif devset_name == "treebank":
+            if devset_name == 'conll2000':
+                devset = conll2000.chunked_sents('train.txt')  # [:100]
+            elif devset_name == 'treebank':
                 devset = treebank_chunk.chunked_sents()  # [:100]
             else:
-                raise ValueError("Unknown development set %s" % devset_name)
+                raise ValueError('Unknown development set %s' % devset_name)
 
         self.chunker = None
         """The chunker built from the grammar string"""
@@ -400,9 +401,9 @@ class RegexpChunkApp(object):
 
         # Set up the main window.
         top = self.top = Tk()
-        top.geometry("+50+50")
-        top.title("Regexp Chunk Parser App")
-        top.bind("<Control-q>", self.destroy)
+        top.geometry('+50+50')
+        top.title('Regexp Chunk Parser App')
+        top.bind('<Control-q>', self.destroy)
 
         # Variable that restricts how much of the devset we look at.
         self._devset_size = IntVar(top)
@@ -417,131 +418,131 @@ class RegexpChunkApp(object):
 
         # If a grammar was given, then display it.
         if grammar:
-            self.grammarbox.insert("end", grammar + "\n")
-            self.grammarbox.mark_set("insert", "1.0")
+            self.grammarbox.insert('end', grammar + '\n')
+            self.grammarbox.mark_set('insert', '1.0')
 
         # Display the first item in the development set
         self.show_devset(0)
         self.update()
 
     def _init_bindings(self, top):
-        top.bind("<Control-n>", self._devset_next)
-        top.bind("<Control-p>", self._devset_prev)
-        top.bind("<Control-t>", self.toggle_show_trace)
-        top.bind("<KeyPress>", self.update)
-        top.bind("<Control-s>", lambda e: self.save_grammar())
-        top.bind("<Control-o>", lambda e: self.load_grammar())
-        self.grammarbox.bind("<Control-t>", self.toggle_show_trace)
-        self.grammarbox.bind("<Control-n>", self._devset_next)
-        self.grammarbox.bind("<Control-p>", self._devset_prev)
+        top.bind('<Control-n>', self._devset_next)
+        top.bind('<Control-p>', self._devset_prev)
+        top.bind('<Control-t>', self.toggle_show_trace)
+        top.bind('<KeyPress>', self.update)
+        top.bind('<Control-s>', lambda e: self.save_grammar())
+        top.bind('<Control-o>', lambda e: self.load_grammar())
+        self.grammarbox.bind('<Control-t>', self.toggle_show_trace)
+        self.grammarbox.bind('<Control-n>', self._devset_next)
+        self.grammarbox.bind('<Control-p>', self._devset_prev)
 
         # Redraw the eval graph when the window size changes
-        self.evalbox.bind("<Configure>", self._eval_plot)
+        self.evalbox.bind('<Configure>', self._eval_plot)
 
     def _init_fonts(self, top):
         # What's our font size (default=same as sysfont)
         self._size = IntVar(top)
         self._size.set(20)
-        self._font = Font(family="helvetica", size=-self._size.get())
+        self._font = Font(family='helvetica', size=-self._size.get())
         self._smallfont = Font(
-            family="helvetica", size=-(int(self._size.get() * 14 // 20))
+            family='helvetica', size=-(int(self._size.get() * 14 // 20))
         )
 
     def _init_menubar(self, parent):
         menubar = Menu(parent)
 
         filemenu = Menu(menubar, tearoff=0)
-        filemenu.add_command(label="Reset Application", underline=0, command=self.reset)
+        filemenu.add_command(label='Reset Application', underline=0, command=self.reset)
         filemenu.add_command(
-            label="Save Current Grammar",
+            label='Save Current Grammar',
             underline=0,
-            accelerator="Ctrl-s",
+            accelerator='Ctrl-s',
             command=self.save_grammar,
         )
         filemenu.add_command(
-            label="Load Grammar",
+            label='Load Grammar',
             underline=0,
-            accelerator="Ctrl-o",
+            accelerator='Ctrl-o',
             command=self.load_grammar,
         )
 
         filemenu.add_command(
-            label="Save Grammar History", underline=13, command=self.save_history
+            label='Save Grammar History', underline=13, command=self.save_history
         )
 
         filemenu.add_command(
-            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
+            label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-q'
         )
-        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
 
         viewmenu = Menu(menubar, tearoff=0)
         viewmenu.add_radiobutton(
-            label="Tiny",
+            label='Tiny',
             variable=self._size,
             underline=0,
             value=10,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Small",
+            label='Small',
             variable=self._size,
             underline=0,
             value=16,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Medium",
+            label='Medium',
             variable=self._size,
             underline=0,
             value=20,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Large",
+            label='Large',
             variable=self._size,
             underline=0,
             value=24,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Huge",
+            label='Huge',
             variable=self._size,
             underline=0,
             value=34,
             command=self.resize,
         )
-        menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+        menubar.add_cascade(label='View', underline=0, menu=viewmenu)
 
         devsetmenu = Menu(menubar, tearoff=0)
         devsetmenu.add_radiobutton(
-            label="50 sentences",
+            label='50 sentences',
             variable=self._devset_size,
             value=50,
             command=self.set_devset_size,
         )
         devsetmenu.add_radiobutton(
-            label="100 sentences",
+            label='100 sentences',
             variable=self._devset_size,
             value=100,
             command=self.set_devset_size,
         )
         devsetmenu.add_radiobutton(
-            label="200 sentences",
+            label='200 sentences',
             variable=self._devset_size,
             value=200,
             command=self.set_devset_size,
         )
         devsetmenu.add_radiobutton(
-            label="500 sentences",
+            label='500 sentences',
             variable=self._devset_size,
             value=500,
             command=self.set_devset_size,
         )
-        menubar.add_cascade(label="Development-Set", underline=0, menu=devsetmenu)
+        menubar.add_cascade(label='Development-Set', underline=0, menu=devsetmenu)
 
         helpmenu = Menu(menubar, tearoff=0)
-        helpmenu.add_command(label="About", underline=0, command=self.about)
-        menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+        helpmenu.add_command(label='About', underline=0, command=self.about)
+        menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
 
         parent.config(menu=menubar)
 
@@ -550,34 +551,34 @@ class RegexpChunkApp(object):
             self.show_devset()
         else:
             self.show_trace()
-        return "break"
+        return 'break'
 
     _SCALE_N = 5  # center on the last 5 examples.
     _DRAW_LINES = False
 
     def _eval_plot(self, *e, **config):
-        width = config.get("width", self.evalbox.winfo_width())
-        height = config.get("height", self.evalbox.winfo_height())
+        width = config.get('width', self.evalbox.winfo_width())
+        height = config.get('height', self.evalbox.winfo_height())
 
         # Clear the canvas
-        self.evalbox.delete("all")
+        self.evalbox.delete('all')
 
         # Draw the precision & recall labels.
         tag = self.evalbox.create_text(
-            10, height // 2 - 10, justify="left", anchor="w", text="Precision"
+            10, height // 2 - 10, justify='left', anchor='w', text='Precision'
         )
         left, right = self.evalbox.bbox(tag)[2] + 5, width - 10
         tag = self.evalbox.create_text(
             left + (width - left) // 2,
             height - 10,
-            anchor="s",
-            text="Recall",
-            justify="center",
+            anchor='s',
+            text='Recall',
+            justify='center',
         )
         top, bot = 10, self.evalbox.bbox(tag)[1] - 10
 
         # Draw masks for clipping the plot.
-        bg = self._EVALBOX_PARAMS["background"]
+        bg = self._EVALBOX_PARAMS['background']
         self.evalbox.lower(
             self.evalbox.create_rectangle(0, 0, left - 1, 5000, fill=bg, outline=bg)
         )
@@ -624,9 +625,9 @@ class RegexpChunkApp(object):
                 (i / 10.0 - min_precision) / (max_precision - min_precision)
             )
             if left < x < right:
-                self.evalbox.create_line(x, top, x, bot, fill="#888")
+                self.evalbox.create_line(x, top, x, bot, fill='#888')
             if top < y < bot:
-                self.evalbox.create_line(left, y, right, y, fill="#888")
+                self.evalbox.create_line(left, y, right, y, fill='#888')
         self.evalbox.create_line(left, top, left, bot)
         self.evalbox.create_line(left, bot, right, bot)
 
@@ -634,30 +635,30 @@ class RegexpChunkApp(object):
         self.evalbox.create_text(
             left - 3,
             bot,
-            justify="right",
-            anchor="se",
-            text="%d%%" % (100 * min_precision),
+            justify='right',
+            anchor='se',
+            text='%d%%' % (100 * min_precision),
         )
         self.evalbox.create_text(
             left - 3,
             top,
-            justify="right",
-            anchor="ne",
-            text="%d%%" % (100 * max_precision),
+            justify='right',
+            anchor='ne',
+            text='%d%%' % (100 * max_precision),
         )
         self.evalbox.create_text(
             left,
             bot + 3,
-            justify="center",
-            anchor="nw",
-            text="%d%%" % (100 * min_recall),
+            justify='center',
+            anchor='nw',
+            text='%d%%' % (100 * min_recall),
         )
         self.evalbox.create_text(
             right,
             bot + 3,
-            justify="center",
-            anchor="ne",
-            text="%d%%" % (100 * max_recall),
+            justify='center',
+            anchor='ne',
+            text='%d%%' % (100 * max_recall),
         )
 
         # Display the scores.
@@ -671,22 +672,22 @@ class RegexpChunkApp(object):
             )
             if i == self._history_index:
                 self.evalbox.create_oval(
-                    x - 2, y - 2, x + 2, y + 2, fill="#0f0", outline="#000"
+                    x - 2, y - 2, x + 2, y + 2, fill='#0f0', outline='#000'
                 )
-                self.status["text"] = (
-                    "Precision: %.2f%%\t" % (precision * 100)
-                    + "Recall: %.2f%%\t" % (recall * 100)
-                    + "F-score: %.2f%%" % (fscore * 100)
+                self.status['text'] = (
+                    'Precision: %.2f%%\t' % (precision * 100)
+                    + 'Recall: %.2f%%\t' % (recall * 100)
+                    + 'F-score: %.2f%%' % (fscore * 100)
                 )
             else:
                 self.evalbox.lower(
                     self.evalbox.create_oval(
-                        x - 2, y - 2, x + 2, y + 2, fill="#afa", outline="#8c8"
+                        x - 2, y - 2, x + 2, y + 2, fill='#afa', outline='#8c8'
                     )
                 )
             if prev_x is not None and self._eval_lines.get():
                 self.evalbox.lower(
-                    self.evalbox.create_line(prev_x, prev_y, x, y, fill="#8c8")
+                    self.evalbox.create_line(prev_x, prev_y, x, y, fill='#8c8')
                 )
             prev_x, prev_y = x, y
 
@@ -729,7 +730,7 @@ class RegexpChunkApp(object):
 
         # If the grammar is empty, then don't bother evaluating it, or
         # recording it in history -- the score will just be 0.
-        if self.normalized_grammar.strip() == "":
+        if self.normalized_grammar.strip() == '':
             # self._eval_index = self._devset_size.get()
             self._eval_demon_running = False
             return
@@ -762,7 +763,7 @@ class RegexpChunkApp(object):
             self._eval_normalized_grammar = None
         else:
             progress = 100 * self._eval_index / self._devset_size.get()
-            self.status["text"] = "Evaluating on Development Set (%d%%)" % progress
+            self.status['text'] = 'Evaluating on Development Set (%d%%)' % progress
             self._eval_demon_running = True
             self._adaptively_modify_eval_chunk(time.time() - t0)
             self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon)
@@ -803,210 +804,210 @@ class RegexpChunkApp(object):
         self.grammarlabel = Label(
             frame0,
             font=self._font,
-            text="Grammar:",
-            highlightcolor="black",
-            background=self._GRAMMARBOX_PARAMS["background"],
+            text='Grammar:',
+            highlightcolor='black',
+            background=self._GRAMMARBOX_PARAMS['background'],
         )
-        self.grammarlabel.grid(column=0, row=0, sticky="SW")
-        self.grammarbox.grid(column=0, row=1, sticky="NEWS")
+        self.grammarlabel.grid(column=0, row=0, sticky='SW')
+        self.grammarbox.grid(column=0, row=1, sticky='NEWS')
 
         # Scroll bar for grammar
         grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview)
-        grammar_scrollbar.grid(column=1, row=1, sticky="NWS")
+        grammar_scrollbar.grid(column=1, row=1, sticky='NWS')
         self.grammarbox.config(yscrollcommand=grammar_scrollbar.set)
 
         # grammar buttons
-        bg = self._FRAME_PARAMS["background"]
+        bg = self._FRAME_PARAMS['background']
         frame3 = Frame(frame0, background=bg)
-        frame3.grid(column=0, row=2, sticky="EW")
+        frame3.grid(column=0, row=2, sticky='EW')
         Button(
             frame3,
-            text="Prev Grammar",
+            text='Prev Grammar',
             command=self._history_prev,
             **self._BUTTON_PARAMS
-        ).pack(side="left")
+        ).pack(side='left')
         Button(
             frame3,
-            text="Next Grammar",
+            text='Next Grammar',
             command=self._history_next,
             **self._BUTTON_PARAMS
-        ).pack(side="left")
+        ).pack(side='left')
 
         # Help box
         self.helpbox = Text(frame0, font=self._smallfont, **self._HELPBOX_PARAMS)
-        self.helpbox.grid(column=3, row=1, sticky="NEWS")
+        self.helpbox.grid(column=3, row=1, sticky='NEWS')
         self.helptabs = {}
-        bg = self._FRAME_PARAMS["background"]
+        bg = self._FRAME_PARAMS['background']
         helptab_frame = Frame(frame0, background=bg)
-        helptab_frame.grid(column=3, row=0, sticky="SW")
+        helptab_frame.grid(column=3, row=0, sticky='SW')
         for i, (tab, tabstops, text) in enumerate(self.HELP):
             label = Label(helptab_frame, text=tab, font=self._smallfont)
-            label.grid(column=i * 2, row=0, sticky="S")
+            label.grid(column=i * 2, row=0, sticky='S')
             # help_frame.grid_columnconfigure(i, weight=1)
             # label.pack(side='left')
-            label.bind("<ButtonPress>", lambda e, tab=tab: self.show_help(tab))
+            label.bind('<ButtonPress>', lambda e, tab=tab: self.show_help(tab))
             self.helptabs[tab] = label
             Frame(
                 helptab_frame, height=1, width=self._HELPTAB_SPACER, background=bg
             ).grid(column=i * 2 + 1, row=0)
         self.helptabs[self.HELP[0][0]].configure(font=self._font)
-        self.helpbox.tag_config("elide", elide=True)
+        self.helpbox.tag_config('elide', elide=True)
         for (tag, params) in self.HELP_AUTOTAG:
-            self.helpbox.tag_config("tag-%s" % tag, **params)
+            self.helpbox.tag_config('tag-%s' % tag, **params)
         self.show_help(self.HELP[0][0])
 
         # Scroll bar for helpbox
         help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview)
         self.helpbox.config(yscrollcommand=help_scrollbar.set)
-        help_scrollbar.grid(column=4, row=1, sticky="NWS")
+        help_scrollbar.grid(column=4, row=1, sticky='NWS')
 
         # The dev set
-        frame4 = Frame(frame0, background=self._FRAME_PARAMS["background"])
+        frame4 = Frame(frame0, background=self._FRAME_PARAMS['background'])
         self.devsetbox = Text(frame4, font=self._font, **self._DEVSETBOX_PARAMS)
-        self.devsetbox.pack(expand=True, fill="both")
+        self.devsetbox.pack(expand=True, fill='both')
         self.devsetlabel = Label(
             frame0,
             font=self._font,
-            text="Development Set:",
-            justify="right",
-            background=self._DEVSETBOX_PARAMS["background"],
+            text='Development Set:',
+            justify='right',
+            background=self._DEVSETBOX_PARAMS['background'],
         )
-        self.devsetlabel.grid(column=0, row=4, sticky="SW")
-        frame4.grid(column=0, row=5, sticky="NEWS")
+        self.devsetlabel.grid(column=0, row=4, sticky='SW')
+        frame4.grid(column=0, row=5, sticky='NEWS')
 
         # dev set scrollbars
         self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll)
-        self.devset_scroll.grid(column=1, row=5, sticky="NWS")
+        self.devset_scroll.grid(column=1, row=5, sticky='NWS')
         self.devset_xscroll = Scrollbar(
-            frame4, command=self.devsetbox.xview, orient="horiz"
+            frame4, command=self.devsetbox.xview, orient='horiz'
         )
-        self.devsetbox["xscrollcommand"] = self.devset_xscroll.set
-        self.devset_xscroll.pack(side="bottom", fill="x")
+        self.devsetbox['xscrollcommand'] = self.devset_xscroll.set
+        self.devset_xscroll.pack(side='bottom', fill='x')
 
         # dev set buttons
-        bg = self._FRAME_PARAMS["background"]
+        bg = self._FRAME_PARAMS['background']
         frame1 = Frame(frame0, background=bg)
-        frame1.grid(column=0, row=7, sticky="EW")
+        frame1.grid(column=0, row=7, sticky='EW')
         Button(
             frame1,
-            text="Prev Example (Ctrl-p)",
+            text='Prev Example (Ctrl-p)',
             command=self._devset_prev,
             **self._BUTTON_PARAMS
-        ).pack(side="left")
+        ).pack(side='left')
         Button(
             frame1,
-            text="Next Example (Ctrl-n)",
+            text='Next Example (Ctrl-n)',
             command=self._devset_next,
             **self._BUTTON_PARAMS
-        ).pack(side="left")
+        ).pack(side='left')
         self.devset_button = Button(
             frame1,
-            text="Show example",
+            text='Show example',
             command=self.show_devset,
-            state="disabled",
+            state='disabled',
             **self._BUTTON_PARAMS
         )
-        self.devset_button.pack(side="right")
+        self.devset_button.pack(side='right')
         self.trace_button = Button(
-            frame1, text="Show trace", command=self.show_trace, **self._BUTTON_PARAMS
+            frame1, text='Show trace', command=self.show_trace, **self._BUTTON_PARAMS
         )
-        self.trace_button.pack(side="right")
+        self.trace_button.pack(side='right')
 
         # evaluation box
         self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS)
         label = Label(
             frame0,
             font=self._font,
-            text="Evaluation:",
-            justify="right",
-            background=self._EVALBOX_PARAMS["background"],
+            text='Evaluation:',
+            justify='right',
+            background=self._EVALBOX_PARAMS['background'],
         )
-        label.grid(column=3, row=4, sticky="SW")
-        self.evalbox.grid(column=3, row=5, sticky="NEWS", columnspan=2)
+        label.grid(column=3, row=4, sticky='SW')
+        self.evalbox.grid(column=3, row=5, sticky='NEWS', columnspan=2)
 
         # evaluation box buttons
-        bg = self._FRAME_PARAMS["background"]
+        bg = self._FRAME_PARAMS['background']
         frame2 = Frame(frame0, background=bg)
-        frame2.grid(column=3, row=7, sticky="EW")
+        frame2.grid(column=3, row=7, sticky='EW')
         self._autoscale = IntVar(self.top)
         self._autoscale.set(False)
         Checkbutton(
             frame2,
             variable=self._autoscale,
             command=self._eval_plot,
-            text="Zoom",
+            text='Zoom',
             **self._BUTTON_PARAMS
-        ).pack(side="left")
+        ).pack(side='left')
         self._eval_lines = IntVar(self.top)
         self._eval_lines.set(False)
         Checkbutton(
             frame2,
             variable=self._eval_lines,
             command=self._eval_plot,
-            text="Lines",
+            text='Lines',
             **self._BUTTON_PARAMS
-        ).pack(side="left")
-        Button(frame2, text="History", **self._BUTTON_PARAMS).pack(side="right")
+        ).pack(side='left')
+        Button(frame2, text='History', **self._BUTTON_PARAMS).pack(side='right')
 
         # The status label
         self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS)
-        self.status.grid(column=0, row=9, sticky="NEW", padx=3, pady=2, columnspan=5)
+        self.status.grid(column=0, row=9, sticky='NEW', padx=3, pady=2, columnspan=5)
 
         # Help box & devset box can't be edited.
-        self.helpbox["state"] = "disabled"
-        self.devsetbox["state"] = "disabled"
+        self.helpbox['state'] = 'disabled'
+        self.devsetbox['state'] = 'disabled'
 
         # Spacers
-        bg = self._FRAME_PARAMS["background"]
+        bg = self._FRAME_PARAMS['background']
         Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3)
         Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0)
         Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8)
 
         # pack the frame.
-        frame0.pack(fill="both", expand=True)
+        frame0.pack(fill='both', expand=True)
 
         # Set up colors for the devset box
-        self.devsetbox.tag_config("true-pos", background="#afa", underline="True")
-        self.devsetbox.tag_config("false-neg", underline="True", foreground="#800")
-        self.devsetbox.tag_config("false-pos", background="#faa")
-        self.devsetbox.tag_config("trace", foreground="#666", wrap="none")
-        self.devsetbox.tag_config("wrapindent", lmargin2=30, wrap="none")
-        self.devsetbox.tag_config("error", foreground="#800")
+        self.devsetbox.tag_config('true-pos', background='#afa', underline='True')
+        self.devsetbox.tag_config('false-neg', underline='True', foreground='#800')
+        self.devsetbox.tag_config('false-pos', background='#faa')
+        self.devsetbox.tag_config('trace', foreground='#666', wrap='none')
+        self.devsetbox.tag_config('wrapindent', lmargin2=30, wrap='none')
+        self.devsetbox.tag_config('error', foreground='#800')
 
         # And for the grammarbox
-        self.grammarbox.tag_config("error", background="#fec")
-        self.grammarbox.tag_config("comment", foreground="#840")
-        self.grammarbox.tag_config("angle", foreground="#00f")
-        self.grammarbox.tag_config("brace", foreground="#0a0")
-        self.grammarbox.tag_config("hangindent", lmargin1=0, lmargin2=40)
+        self.grammarbox.tag_config('error', background='#fec')
+        self.grammarbox.tag_config('comment', foreground='#840')
+        self.grammarbox.tag_config('angle', foreground='#00f')
+        self.grammarbox.tag_config('brace', foreground='#0a0')
+        self.grammarbox.tag_config('hangindent', lmargin1=0, lmargin2=40)
 
     _showing_trace = False
 
     def show_trace(self, *e):
         self._showing_trace = True
-        self.trace_button["state"] = "disabled"
-        self.devset_button["state"] = "normal"
+        self.trace_button['state'] = 'disabled'
+        self.devset_button['state'] = 'normal'
 
-        self.devsetbox["state"] = "normal"
+        self.devsetbox['state'] = 'normal'
         # self.devsetbox['wrap'] = 'none'
-        self.devsetbox.delete("1.0", "end")
-        self.devsetlabel["text"] = "Development Set (%d/%d)" % (
+        self.devsetbox.delete('1.0', 'end')
+        self.devsetlabel['text'] = 'Development Set (%d/%d)' % (
             (self.devset_index + 1, self._devset_size.get())
         )
 
         if self.chunker is None:
-            self.devsetbox.insert("1.0", "Trace: waiting for a valid grammar.")
-            self.devsetbox.tag_add("error", "1.0", "end")
+            self.devsetbox.insert('1.0', 'Trace: waiting for a valid grammar.')
+            self.devsetbox.tag_add('error', '1.0', 'end')
             return  # can't do anything more
 
         gold_tree = self.devset[self.devset_index]
         rules = self.chunker.rules()
 
         # Calculate the tag sequence
-        tagseq = "\t"
+        tagseq = '\t'
         charnum = [1]
         for wordnum, (word, pos) in enumerate(gold_tree.leaves()):
-            tagseq += "%s " % pos
+            tagseq += '%s ' % pos
             charnum.append(len(tagseq))
         self.charnum = dict(
             ((i, j), charnum[j])
@@ -1017,14 +1018,14 @@ class RegexpChunkApp(object):
 
         for i in range(len(rules) + 1):
             if i == 0:
-                self.devsetbox.insert("end", "Start:\n")
-                self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
+                self.devsetbox.insert('end', 'Start:\n')
+                self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
             else:
-                self.devsetbox.insert("end", "Apply %s:\n" % rules[i - 1])
-                self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
+                self.devsetbox.insert('end', 'Apply %s:\n' % rules[i - 1])
+                self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
             # Display the tag sequence.
-            self.devsetbox.insert("end", tagseq + "\n")
-            self.devsetbox.tag_add("wrapindent", "end -2c linestart", "end -2c")
+            self.devsetbox.insert('end', tagseq + '\n')
+            self.devsetbox.tag_add('wrapindent', 'end -2c linestart', 'end -2c')
             # Run a partial parser, and extract gold & test chunks
             chunker = RegexpChunkParser(rules[:i])
             test_tree = self._chunkparse(gold_tree.leaves())
@@ -1032,13 +1033,13 @@ class RegexpChunkApp(object):
             test_chunks = self._chunks(test_tree)
             # Compare them.
             for chunk in gold_chunks.intersection(test_chunks):
-                self._color_chunk(i, chunk, "true-pos")
+                self._color_chunk(i, chunk, 'true-pos')
             for chunk in gold_chunks - test_chunks:
-                self._color_chunk(i, chunk, "false-neg")
+                self._color_chunk(i, chunk, 'false-neg')
             for chunk in test_chunks - gold_chunks:
-                self._color_chunk(i, chunk, "false-pos")
-        self.devsetbox.insert("end", "Finished.\n")
-        self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
+                self._color_chunk(i, chunk, 'false-pos')
+        self.devsetbox.insert('end', 'Finished.\n')
+        self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c')
 
         # This is a hack, because the x-scrollbar isn't updating its
         # position right -- I'm not sure what the underlying cause is
@@ -1046,18 +1047,18 @@ class RegexpChunkApp(object):
         self.top.after(100, self.devset_xscroll.set, 0, 0.3)
 
     def show_help(self, tab):
-        self.helpbox["state"] = "normal"
-        self.helpbox.delete("1.0", "end")
+        self.helpbox['state'] = 'normal'
+        self.helpbox.delete('1.0', 'end')
         for (name, tabstops, text) in self.HELP:
             if name == tab:
                 text = text.replace(
-                    "<<TAGSET>>",
-                    "\n".join(
+                    '<<TAGSET>>',
+                    '\n'.join(
                         (
-                            "\t%s\t%s" % item
+                            '\t%s\t%s' % item
                             for item in sorted(
                                 list(self.tagset.items()),
-                                key=lambda t_w: re.match("\w+", t_w[0])
+                                key=lambda t_w: re.match('\w+', t_w[0])
                                 and (0, t_w[0])
                                 or (1, t_w[0]),
                             )
@@ -1067,27 +1068,27 @@ class RegexpChunkApp(object):
 
                 self.helptabs[name].config(**self._HELPTAB_FG_PARAMS)
                 self.helpbox.config(tabs=tabstops)
-                self.helpbox.insert("1.0", text + "\n" * 20)
-                C = "1.0 + %d chars"
+                self.helpbox.insert('1.0', text + '\n' * 20)
+                C = '1.0 + %d chars'
                 for (tag, params) in self.HELP_AUTOTAG:
-                    pattern = "(?s)(<%s>)(.*?)(</%s>)" % (tag, tag)
+                    pattern = '(?s)(<%s>)(.*?)(</%s>)' % (tag, tag)
                     for m in re.finditer(pattern, text):
-                        self.helpbox.tag_add("elide", C % m.start(1), C % m.end(1))
+                        self.helpbox.tag_add('elide', C % m.start(1), C % m.end(1))
                         self.helpbox.tag_add(
-                            "tag-%s" % tag, C % m.start(2), C % m.end(2)
+                            'tag-%s' % tag, C % m.start(2), C % m.end(2)
                         )
-                        self.helpbox.tag_add("elide", C % m.start(3), C % m.end(3))
+                        self.helpbox.tag_add('elide', C % m.start(3), C % m.end(3))
             else:
                 self.helptabs[name].config(**self._HELPTAB_BG_PARAMS)
-        self.helpbox["state"] = "disabled"
+        self.helpbox['state'] = 'disabled'
 
     def _history_prev(self, *e):
         self._view_history(self._history_index - 1)
-        return "break"
+        return 'break'
 
     def _history_next(self, *e):
         self._view_history(self._history_index + 1)
-        return "break"
+        return 'break'
 
     def _view_history(self, index):
         # Bounds & sanity checking:
@@ -1099,10 +1100,10 @@ class RegexpChunkApp(object):
             return
         # Show the requested grammar.  It will get added to _history
         # only if they edit it (causing self.update() to get run.)
-        self.grammarbox["state"] = "normal"
-        self.grammarbox.delete("1.0", "end")
-        self.grammarbox.insert("end", self._history[index][0])
-        self.grammarbox.mark_set("insert", "1.0")
+        self.grammarbox['state'] = 'normal'
+        self.grammarbox.delete('1.0', 'end')
+        self.grammarbox.insert('end', self._history[index][0])
+        self.grammarbox.mark_set('insert', '1.0')
         self._history_index = index
         self._syntax_highlight_grammar(self._history[index][0])
         # Record the normalized grammar & regenerate the chunker.
@@ -1110,7 +1111,7 @@ class RegexpChunkApp(object):
         if self.normalized_grammar:
             rules = [
                 RegexpChunkRule.fromstring(line)
-                for line in self.normalized_grammar.split("\n")
+                for line in self.normalized_grammar.split('\n')
             ]
         else:
             rules = []
@@ -1123,20 +1124,20 @@ class RegexpChunkApp(object):
             self.show_trace()
         # Update the grammar label
         if self._history_index < len(self._history) - 1:
-            self.grammarlabel["text"] = "Grammar %s/%s:" % (
+            self.grammarlabel['text'] = 'Grammar %s/%s:' % (
                 self._history_index + 1,
                 len(self._history),
             )
         else:
-            self.grammarlabel["text"] = "Grammar:"
+            self.grammarlabel['text'] = 'Grammar:'
 
     def _devset_next(self, *e):
-        self._devset_scroll("scroll", 1, "page")
-        return "break"
+        self._devset_scroll('scroll', 1, 'page')
+        return 'break'
 
     def _devset_prev(self, *e):
-        self._devset_scroll("scroll", -1, "page")
-        return "break"
+        self._devset_scroll('scroll', -1, 'page')
+        return 'break'
 
     def destroy(self, *e):
         if self.top is None:
@@ -1147,14 +1148,14 @@ class RegexpChunkApp(object):
     def _devset_scroll(self, command, *args):
         N = 1  # size of a page -- one sentence.
         showing_trace = self._showing_trace
-        if command == "scroll" and args[1].startswith("unit"):
+        if command == 'scroll' and args[1].startswith('unit'):
             self.show_devset(self.devset_index + int(args[0]))
-        elif command == "scroll" and args[1].startswith("page"):
+        elif command == 'scroll' and args[1].startswith('page'):
             self.show_devset(self.devset_index + N * int(args[0]))
-        elif command == "moveto":
+        elif command == 'moveto':
             self.show_devset(int(float(args[0]) * self._devset_size.get()))
         else:
-            assert 0, "bad scroll command %s %s" % (command, args)
+            assert 0, 'bad scroll command %s %s' % (command, args)
         if showing_trace:
             self.show_trace()
 
@@ -1170,14 +1171,14 @@ class RegexpChunkApp(object):
         self.devset_index = index
 
         self._showing_trace = False
-        self.trace_button["state"] = "normal"
-        self.devset_button["state"] = "disabled"
+        self.trace_button['state'] = 'normal'
+        self.devset_button['state'] = 'disabled'
 
         # Clear the text box.
-        self.devsetbox["state"] = "normal"
-        self.devsetbox["wrap"] = "word"
-        self.devsetbox.delete("1.0", "end")
-        self.devsetlabel["text"] = "Development Set (%d/%d)" % (
+        self.devsetbox['state'] = 'normal'
+        self.devsetbox['wrap'] = 'word'
+        self.devsetbox.delete('1.0', 'end')
+        self.devsetlabel['text'] = 'Development Set (%d/%d)' % (
             (self.devset_index + 1, self._devset_size.get())
         )
 
@@ -1186,17 +1187,17 @@ class RegexpChunkApp(object):
         self.charnum = {}
         self.linenum = {0: 1}
         for sentnum, sent in enumerate(sample):
-            linestr = ""
+            linestr = ''
             for wordnum, (word, pos) in enumerate(sent.leaves()):
                 self.charnum[sentnum, wordnum] = len(linestr)
-                linestr += "%s/%s " % (word, pos)
+                linestr += '%s/%s ' % (word, pos)
                 self.charnum[sentnum, wordnum + 1] = len(linestr)
-            self.devsetbox.insert("end", linestr[:-1] + "\n\n")
+            self.devsetbox.insert('end', linestr[:-1] + '\n\n')
 
         # Highlight chunks in the dev set
         if self.chunker is not None:
             self._highlight_devset()
-        self.devsetbox["state"] = "disabled"
+        self.devsetbox['state'] = 'disabled'
 
         # Update the scrollbar
         first = self.devset_index / self._devset_size.get()
@@ -1218,46 +1219,46 @@ class RegexpChunkApp(object):
     def _syntax_highlight_grammar(self, grammar):
         if self.top is None:
             return
-        self.grammarbox.tag_remove("comment", "1.0", "end")
-        self.grammarbox.tag_remove("angle", "1.0", "end")
-        self.grammarbox.tag_remove("brace", "1.0", "end")
-        self.grammarbox.tag_add("hangindent", "1.0", "end")
-        for lineno, line in enumerate(grammar.split("\n")):
+        self.grammarbox.tag_remove('comment', '1.0', 'end')
+        self.grammarbox.tag_remove('angle', '1.0', 'end')
+        self.grammarbox.tag_remove('brace', '1.0', 'end')
+        self.grammarbox.tag_add('hangindent', '1.0', 'end')
+        for lineno, line in enumerate(grammar.split('\n')):
             if not line.strip():
                 continue
-            m = re.match(r"(\\.|[^#])*(#.*)?", line)
+            m = re.match(r'(\\.|[^#])*(#.*)?', line)
             comment_start = None
             if m.group(2):
                 comment_start = m.start(2)
-                s = "%d.%d" % (lineno + 1, m.start(2))
-                e = "%d.%d" % (lineno + 1, m.end(2))
-                self.grammarbox.tag_add("comment", s, e)
-            for m in re.finditer("[<>{}]", line):
+                s = '%d.%d' % (lineno + 1, m.start(2))
+                e = '%d.%d' % (lineno + 1, m.end(2))
+                self.grammarbox.tag_add('comment', s, e)
+            for m in re.finditer('[<>{}]', line):
                 if comment_start is not None and m.start() >= comment_start:
                     break
-                s = "%d.%d" % (lineno + 1, m.start())
-                e = "%d.%d" % (lineno + 1, m.end())
-                if m.group() in "<>":
-                    self.grammarbox.tag_add("angle", s, e)
+                s = '%d.%d' % (lineno + 1, m.start())
+                e = '%d.%d' % (lineno + 1, m.end())
+                if m.group() in '<>':
+                    self.grammarbox.tag_add('angle', s, e)
                 else:
-                    self.grammarbox.tag_add("brace", s, e)
+                    self.grammarbox.tag_add('brace', s, e)
 
     def _grammarcheck(self, grammar):
         if self.top is None:
             return
-        self.grammarbox.tag_remove("error", "1.0", "end")
+        self.grammarbox.tag_remove('error', '1.0', 'end')
         self._grammarcheck_errs = []
-        for lineno, line in enumerate(grammar.split("\n")):
-            line = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", line)
+        for lineno, line in enumerate(grammar.split('\n')):
+            line = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', line)
             line = line.strip()
             if line:
                 try:
                     RegexpChunkRule.fromstring(line)
                 except ValueError as e:
                     self.grammarbox.tag_add(
-                        "error", "%s.0" % (lineno + 1), "%s.0 lineend" % (lineno + 1)
+                        'error', '%s.0' % (lineno + 1), '%s.0 lineend' % (lineno + 1)
                     )
-        self.status["text"] = ""
+        self.status['text'] = ''
 
     def update(self, *event):
         # Record when update was called (for grammarcheck)
@@ -1265,7 +1266,7 @@ class RegexpChunkApp(object):
             self._last_keypress = time.time()
 
         # Read the grammar from the Text box.
-        self.grammar = grammar = self.grammarbox.get("1.0", "end")
+        self.grammar = grammar = self.grammarbox.get('1.0', 'end')
 
         # If the grammar hasn't changed, do nothing:
         normalized_grammar = self.normalize_grammar(grammar)
@@ -1277,7 +1278,7 @@ class RegexpChunkApp(object):
         # If the grammar has changed, and we're looking at history,
         # then stop looking at history.
         if self._history_index < len(self._history) - 1:
-            self.grammarlabel["text"] = "Grammar:"
+            self.grammarlabel['text'] = 'Grammar:'
 
         self._syntax_highlight_grammar(grammar)
 
@@ -1288,7 +1289,7 @@ class RegexpChunkApp(object):
             if normalized_grammar:
                 rules = [
                     RegexpChunkRule.fromstring(line)
-                    for line in normalized_grammar.split("\n")
+                    for line in normalized_grammar.split('\n')
                 ]
             else:
                 rules = []
@@ -1299,7 +1300,7 @@ class RegexpChunkApp(object):
             return
 
         self.chunker = RegexpChunkParser(rules)
-        self.grammarbox.tag_remove("error", "1.0", "end")
+        self.grammarbox.tag_remove('error', '1.0', 'end')
         self.grammar_changed = time.time()
         # Display the results
         if self._showing_trace:
@@ -1314,9 +1315,9 @@ class RegexpChunkApp(object):
         if sample is None:
             sample = self.devset[self.devset_index : self.devset_index + 1]
 
-        self.devsetbox.tag_remove("true-pos", "1.0", "end")
-        self.devsetbox.tag_remove("false-neg", "1.0", "end")
-        self.devsetbox.tag_remove("false-pos", "1.0", "end")
+        self.devsetbox.tag_remove('true-pos', '1.0', 'end')
+        self.devsetbox.tag_remove('false-neg', '1.0', 'end')
+        self.devsetbox.tag_remove('false-pos', '1.0', 'end')
 
         # Run the grammar on the test cases.
         for sentnum, gold_tree in enumerate(sample):
@@ -1327,11 +1328,11 @@ class RegexpChunkApp(object):
             test_chunks = self._chunks(test_tree)
             # Compare them.
             for chunk in gold_chunks.intersection(test_chunks):
-                self._color_chunk(sentnum, chunk, "true-pos")
+                self._color_chunk(sentnum, chunk, 'true-pos')
             for chunk in gold_chunks - test_chunks:
-                self._color_chunk(sentnum, chunk, "false-neg")
+                self._color_chunk(sentnum, chunk, 'false-neg')
             for chunk in test_chunks - gold_chunks:
-                self._color_chunk(sentnum, chunk, "false-pos")
+                self._color_chunk(sentnum, chunk, 'false-pos')
 
     def _chunkparse(self, words):
         try:
@@ -1340,7 +1341,7 @@ class RegexpChunkApp(object):
             # There's an error somewhere in the grammar, but we're not sure
             # exactly where, so just mark the whole grammar as bad.
             # E.g., this is caused by: "({<NN>})"
-            self.grammarbox.tag_add("error", "1.0", "end")
+            self.grammarbox.tag_add('error', '1.0', 'end')
             # Treat it as tagging nothing:
             return words
 
@@ -1348,8 +1349,8 @@ class RegexpChunkApp(object):
         start, end = chunk
         self.devsetbox.tag_add(
             tag,
-            "%s.%s" % (self.linenum[sentnum], self.charnum[sentnum, start]),
-            "%s.%s" % (self.linenum[sentnum], self.charnum[sentnum, end] - 1),
+            '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, start]),
+            '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, end] - 1),
         )
 
     def reset(self):
@@ -1361,40 +1362,40 @@ class RegexpChunkApp(object):
         self._history = []
         self._history_index = 0
         # Update the on-screen display.
-        self.grammarbox.delete("1.0", "end")
+        self.grammarbox.delete('1.0', 'end')
         self.show_devset(0)
         self.update()
         # self._eval_plot()
 
     SAVE_GRAMMAR_TEMPLATE = (
-        "# Regexp Chunk Parsing Grammar\n"
-        "# Saved %(date)s\n"
-        "#\n"
-        "# Development set: %(devset)s\n"
-        "#   Precision: %(precision)s\n"
-        "#   Recall:    %(recall)s\n"
-        "#   F-score:   %(fscore)s\n\n"
-        "%(grammar)s\n"
+        '# Regexp Chunk Parsing Grammar\n'
+        '# Saved %(date)s\n'
+        '#\n'
+        '# Development set: %(devset)s\n'
+        '#   Precision: %(precision)s\n'
+        '#   Recall:    %(recall)s\n'
+        '#   F-score:   %(fscore)s\n\n'
+        '%(grammar)s\n'
     )
 
     def save_grammar(self, filename=None):
         if not filename:
-            ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")]
-            filename = asksaveasfilename(filetypes=ftypes, defaultextension=".chunk")
+            ftypes = [('Chunk Gramamr', '.chunk'), ('All files', '*')]
+            filename = asksaveasfilename(filetypes=ftypes, defaultextension='.chunk')
             if not filename:
                 return
         if self._history and self.normalized_grammar == self.normalize_grammar(
             self._history[-1][0]
         ):
             precision, recall, fscore = [
-                "%.2f%%" % (100 * v) for v in self._history[-1][1:]
+                '%.2f%%' % (100 * v) for v in self._history[-1][1:]
             ]
         elif self.chunker is None:
-            precision = recall = fscore = "Grammar not well formed"
+            precision = recall = fscore = 'Grammar not well formed'
         else:
-            precision = recall = fscore = "Not finished evaluation yet"
+            precision = recall = fscore = 'Not finished evaluation yet'
 
-        with open(filename, "w") as outfile:
+        with open(filename, 'w') as outfile:
             outfile.write(
                 self.SAVE_GRAMMAR_TEMPLATE
                 % dict(
@@ -1409,39 +1410,39 @@ class RegexpChunkApp(object):
 
     def load_grammar(self, filename=None):
         if not filename:
-            ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")]
-            filename = askopenfilename(filetypes=ftypes, defaultextension=".chunk")
+            ftypes = [('Chunk Gramamr', '.chunk'), ('All files', '*')]
+            filename = askopenfilename(filetypes=ftypes, defaultextension='.chunk')
             if not filename:
                 return
-        self.grammarbox.delete("1.0", "end")
+        self.grammarbox.delete('1.0', 'end')
         self.update()
-        with open(filename, "r") as infile:
+        with open(filename, 'r') as infile:
             grammar = infile.read()
         grammar = re.sub(
-            "^\# Regexp Chunk Parsing Grammar[\s\S]*" "F-score:.*\n", "", grammar
+            '^\# Regexp Chunk Parsing Grammar[\s\S]*' 'F-score:.*\n', '', grammar
         ).lstrip()
-        self.grammarbox.insert("1.0", grammar)
+        self.grammarbox.insert('1.0', grammar)
         self.update()
 
     def save_history(self, filename=None):
         if not filename:
-            ftypes = [("Chunk Gramamr History", ".txt"), ("All files", "*")]
-            filename = asksaveasfilename(filetypes=ftypes, defaultextension=".txt")
+            ftypes = [('Chunk Gramamr History', '.txt'), ('All files', '*')]
+            filename = asksaveasfilename(filetypes=ftypes, defaultextension='.txt')
             if not filename:
                 return
 
-        with open(filename, "w") as outfile:
-            outfile.write("# Regexp Chunk Parsing Grammar History\n")
-            outfile.write("# Saved %s\n" % time.ctime())
-            outfile.write("# Development set: %s\n" % self.devset_name)
+        with open(filename, 'w') as outfile:
+            outfile.write('# Regexp Chunk Parsing Grammar History\n')
+            outfile.write('# Saved %s\n' % time.ctime())
+            outfile.write('# Development set: %s\n' % self.devset_name)
             for i, (g, p, r, f) in enumerate(self._history):
                 hdr = (
-                    "Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, "
-                    "fscore=%.2f%%)"
+                    'Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, '
+                    'fscore=%.2f%%)'
                     % (i + 1, len(self._history), p * 100, r * 100, f * 100)
                 )
-                outfile.write("\n%s\n" % hdr)
-                outfile.write("".join("  %s\n" % line for line in g.strip().split()))
+                outfile.write('\n%s\n' % hdr)
+                outfile.write(''.join('  %s\n' % line for line in g.strip().split()))
 
             if not (
                 self._history
@@ -1449,18 +1450,18 @@ class RegexpChunkApp(object):
                 == self.normalize_grammar(self._history[-1][0])
             ):
                 if self.chunker is None:
-                    outfile.write("\nCurrent Grammar (not well-formed)\n")
+                    outfile.write('\nCurrent Grammar (not well-formed)\n')
                 else:
-                    outfile.write("\nCurrent Grammar (not evaluated)\n")
+                    outfile.write('\nCurrent Grammar (not evaluated)\n')
                 outfile.write(
-                    "".join("  %s\n" % line for line in self.grammar.strip().split())
+                    ''.join('  %s\n' % line for line in self.grammar.strip().split())
                 )
 
     def about(self, *e):
         ABOUT = "NLTK RegExp Chunk Parser Application\n" + "Written by Edward Loper"
-        TITLE = "About: Regular Expression Chunk Parser Application"
+        TITLE = 'About: Regular Expression Chunk Parser Application'
         try:
-            from tkinter.messagebox import Message
+            from six.moves.tkinter_messagebox import Message
 
             Message(message=ABOUT, title=TITLE).show()
         except:
@@ -1497,7 +1498,7 @@ def app():
     RegexpChunkApp().mainloop()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     app()
 
-__all__ = ["app"]
+__all__ = ['app']
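
The chunkparser hunks above are mostly quote-style churn, but they also trace the Tk Text idiom the whole app is built on: positions are 'line.column' index strings ('1.0' is line 1, column 0), regions are coloured by laying named tags over index ranges, and boxes are flipped between 'normal' and 'disabled' around programmatic edits. A stripped-down sketch of that idiom follows; the widget names and the toy chunk rule are invented for illustration and are not part of this commit.

from six.moves.tkinter import Tk, Text

root = Tk()
box = Text(root, width=40, height=5)
box.pack()
box.tag_config('error', background='#FF8888')

box['state'] = 'normal'                    # allow programmatic edits
box.delete('1.0', 'end')
box.insert('1.0', 'NP: {<DT>?<JJ>*<NN>}\n')
box.tag_add('error', '1.4', '1.20')        # colour columns 4-20 of line 1
box['state'] = 'disabled'                  # read-only for the user
# root.mainloop()
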
index 36362a8..b2165e9 100644 (file)
@@ -1,17 +1,19 @@
 # Natural Language Toolkit: Collocations Application
 # Much of the GUI code is imported from concordance.py; we intend to merge these tools together.
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 #
 
 
+from __future__ import division
+
 import threading
 
-import queue as q
-from tkinter.font import Font
-from tkinter import (
+from six.moves import queue as q
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (
     Button,
     END,
     Frame,
@@ -44,38 +46,38 @@ from nltk.util import in_idle
 from nltk.probability import FreqDist
 
 
-CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
-ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
+CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
+ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'
 POLL_INTERVAL = 100
 
-_DEFAULT = "English: Brown Corpus (Humor)"
+_DEFAULT = 'English: Brown Corpus (Humor)'
 _CORPORA = {
-    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
-    "English: Brown Corpus": lambda: brown.words(),
-    "English: Brown Corpus (Press)": lambda: brown.words(
-        categories=["news", "editorial", "reviews"]
+    'Catalan: CESS-CAT Corpus': lambda: cess_cat.words(),
+    'English: Brown Corpus': lambda: brown.words(),
+    'English: Brown Corpus (Press)': lambda: brown.words(
+        categories=['news', 'editorial', 'reviews']
     ),
-    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
-    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
-    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
-        categories="science_fiction"
+    'English: Brown Corpus (Religion)': lambda: brown.words(categories='religion'),
+    'English: Brown Corpus (Learned)': lambda: brown.words(categories='learned'),
+    'English: Brown Corpus (Science Fiction)': lambda: brown.words(
+        categories='science_fiction'
     ),
-    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
-    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
-    "English: NPS Chat Corpus": lambda: nps_chat.words(),
-    "English: Wall Street Journal Corpus": lambda: treebank.words(),
-    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
-    "Dutch: Alpino Corpus": lambda: alpino.words(),
-    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
-    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
-    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
-    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
-    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
+    'English: Brown Corpus (Romance)': lambda: brown.words(categories='romance'),
+    'English: Brown Corpus (Humor)': lambda: brown.words(categories='humor'),
+    'English: NPS Chat Corpus': lambda: nps_chat.words(),
+    'English: Wall Street Journal Corpus': lambda: treebank.words(),
+    'Chinese: Sinica Corpus': lambda: sinica_treebank.words(),
+    'Dutch: Alpino Corpus': lambda: alpino.words(),
+    'Hindi: Indian Languages Corpus': lambda: indian.words(files='hindi.pos'),
+    'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.words(),
+    'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.words(),
+    'Portuguese: Machado Corpus (Brazil)': lambda: machado.words(),
+    'Spanish: CESS-ESP Corpus': lambda: cess_esp.words(),
 }
 
 
 class CollocationsView:
-    _BACKGROUND_COLOUR = "#FFF"  # white
+    _BACKGROUND_COLOUR = '#FFF'  # white
 
     def __init__(self):
         self.queue = q.Queue()
@@ -88,10 +90,10 @@ class CollocationsView:
         self.after = self.top.after(POLL_INTERVAL, self._poll)
 
     def _init_top(self, top):
-        top.geometry("550x650+50+50")
-        top.title("NLTK Collocations List")
-        top.bind("<Control-q>", self.destroy)
-        top.protocol("WM_DELETE_WINDOW", self.destroy)
+        top.geometry('550x650+50+50')
+        top.title('NLTK Collocations List')
+        top.bind('<Control-q>', self.destroy)
+        top.protocol('WM_DELETE_WINDOW', self.destroy)
         top.minsize(550, 650)
 
     def _init_widgets(self, parent):
@@ -102,7 +104,7 @@ class CollocationsView:
         self._init_results_box(self.main_frame)
         self._init_paging(self.main_frame)
         self._init_status(self.main_frame)
-        self.main_frame.pack(fill="both", expand=True)
+        self.main_frame.pack(fill='both', expand=True)
 
     def _init_corpus_select(self, parent):
         innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
@@ -111,12 +113,12 @@ class CollocationsView:
         Label(
             innerframe,
             justify=LEFT,
-            text=" Corpus: ",
+            text=' Corpus: ',
             background=self._BACKGROUND_COLOUR,
             padx=2,
             pady=1,
             border=0,
-        ).pack(side="left")
+        ).pack(side='left')
 
         other_corpora = list(self.model.CORPORA.keys()).remove(
             self.model.DEFAULT_CORPUS
@@ -128,10 +130,10 @@ class CollocationsView:
             command=self.corpus_selected,
             *self.model.non_default_corpora()
         )
-        om["borderwidth"] = 0
-        om["highlightthickness"] = 1
-        om.pack(side="left")
-        innerframe.pack(side="top", fill="x", anchor="n")
+        om['borderwidth'] = 0
+        om['highlightthickness'] = 1
+        om.pack(side='left')
+        innerframe.pack(side='top', fill='x', anchor='n')
 
     def _init_status(self, parent):
         self.status = Label(
@@ -143,7 +145,7 @@ class CollocationsView:
             padx=1,
             pady=0,
         )
-        self.status.pack(side="top", anchor="sw")
+        self.status.pack(side='top', anchor='sw')
 
     def _init_menubar(self):
         self._result_size = IntVar(self.top)
@@ -151,37 +153,37 @@ class CollocationsView:
 
         filemenu = Menu(menubar, tearoff=0, borderwidth=0)
         filemenu.add_command(
-            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
+            label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-q'
         )
-        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
 
         editmenu = Menu(menubar, tearoff=0)
         rescntmenu = Menu(editmenu, tearoff=0)
         rescntmenu.add_radiobutton(
-            label="20",
+            label='20',
             variable=self._result_size,
             underline=0,
             value=20,
             command=self.set_result_size,
         )
         rescntmenu.add_radiobutton(
-            label="50",
+            label='50',
             variable=self._result_size,
             underline=0,
             value=50,
             command=self.set_result_size,
         )
         rescntmenu.add_radiobutton(
-            label="100",
+            label='100',
             variable=self._result_size,
             underline=0,
             value=100,
             command=self.set_result_size,
         )
         rescntmenu.invoke(1)
-        editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
+        editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu)
 
-        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+        menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
         self.top.config(menu=menubar)
 
     def set_result_size(self, **kwargs):
@@ -192,55 +194,55 @@ class CollocationsView:
         i1 = Frame(innerframe)
         i2 = Frame(innerframe)
         vscrollbar = Scrollbar(i1, borderwidth=1)
-        hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
+        hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
         self.results_box = Text(
             i1,
-            font=Font(family="courier", size="16"),
-            state="disabled",
+            font=Font(family='courier', size='16'),
+            state='disabled',
             borderwidth=1,
             yscrollcommand=vscrollbar.set,
             xscrollcommand=hscrollbar.set,
-            wrap="none",
-            width="40",
-            height="20",
+            wrap='none',
+            width='40',
+            height='20',
             exportselection=1,
         )
-        self.results_box.pack(side="left", fill="both", expand=True)
-        vscrollbar.pack(side="left", fill="y", anchor="e")
+        self.results_box.pack(side='left', fill='both', expand=True)
+        vscrollbar.pack(side='left', fill='y', anchor='e')
         vscrollbar.config(command=self.results_box.yview)
-        hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
+        hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
         hscrollbar.config(command=self.results_box.xview)
         # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
-        Label(i2, text="   ", background=self._BACKGROUND_COLOUR).pack(
-            side="left", anchor="e"
+        Label(i2, text='   ', background=self._BACKGROUND_COLOUR).pack(
+            side='left', anchor='e'
         )
-        i1.pack(side="top", fill="both", expand=True, anchor="n")
-        i2.pack(side="bottom", fill="x", anchor="s")
-        innerframe.pack(side="top", fill="both", expand=True)
+        i1.pack(side='top', fill='both', expand=True, anchor='n')
+        i2.pack(side='bottom', fill='x', anchor='s')
+        innerframe.pack(side='top', fill='both', expand=True)
 
     def _init_paging(self, parent):
         innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
         self.prev = prev = Button(
             innerframe,
-            text="Previous",
+            text='Previous',
             command=self.previous,
-            width="10",
+            width='10',
             borderwidth=1,
             highlightthickness=1,
-            state="disabled",
+            state='disabled',
         )
-        prev.pack(side="left", anchor="center")
+        prev.pack(side='left', anchor='center')
         self.next = next = Button(
             innerframe,
-            text="Next",
+            text='Next',
             command=self.__next__,
-            width="10",
+            width='10',
             borderwidth=1,
             highlightthickness=1,
-            state="disabled",
+            state='disabled',
         )
-        next.pack(side="right", anchor="center")
-        innerframe.pack(side="top", fill="y")
+        next.pack(side='right', anchor='center')
+        innerframe.pack(side='top', fill='y')
         self.reset_current_page()
 
     def reset_current_page(self):
@@ -259,14 +261,14 @@ class CollocationsView:
         self.after = self.top.after(POLL_INTERVAL, self._poll)
 
     def handle_error_loading_corpus(self, event):
-        self.status["text"] = "Error in loading " + self.var.get()
+        self.status['text'] = 'Error in loading ' + self.var.get()
         self.unfreeze_editable()
         self.clear_results_box()
         self.freeze_editable()
         self.reset_current_page()
 
     def handle_corpus_loaded(self, event):
-        self.status["text"] = self.var.get() + " is loaded"
+        self.status['text'] = self.var.get() + ' is loaded'
         self.unfreeze_editable()
         self.clear_results_box()
         self.reset_current_page()
@@ -297,22 +299,22 @@ class CollocationsView:
 
     def load_corpus(self, selection):
         if self.model.selected_corpus != selection:
-            self.status["text"] = "Loading " + selection + "..."
+            self.status['text'] = 'Loading ' + selection + '...'
             self.freeze_editable()
             self.model.load_corpus(selection)
 
     def freeze_editable(self):
-        self.prev["state"] = "disabled"
-        self.next["state"] = "disabled"
+        self.prev['state'] = 'disabled'
+        self.next['state'] = 'disabled'
 
     def clear_results_box(self):
-        self.results_box["state"] = "normal"
+        self.results_box['state'] = 'normal'
         self.results_box.delete("1.0", END)
-        self.results_box["state"] = "disabled"
+        self.results_box['state'] = 'disabled'
 
     def fire_event(self, event):
         # Firing an event so that rendering of widgets happens in the mainloop thread
-        self.top.event_generate(event, when="tail")
+        self.top.event_generate(event, when='tail')
 
     def destroy(self, *e):
         if self.top is None:
@@ -331,21 +333,21 @@ class CollocationsView:
 
     def set_paging_button_states(self):
         if self.current_page == -1 or self.current_page == 0:
-            self.prev["state"] = "disabled"
+            self.prev['state'] = 'disabled'
         else:
-            self.prev["state"] = "normal"
+            self.prev['state'] = 'normal'
         if self.model.is_last_page(self.current_page):
-            self.next["state"] = "disabled"
+            self.next['state'] = 'disabled'
         else:
-            self.next["state"] = "normal"
+            self.next['state'] = 'normal'
 
     def write_results(self, results):
-        self.results_box["state"] = "normal"
+        self.results_box['state'] = 'normal'
         row = 1
         for each in results:
-            self.results_box.insert(str(row) + ".0", each[0] + " " + each[1] + "\n")
+            self.results_box.insert(str(row) + '.0', each[0] + " " + each[1] + "\n")
             row += 1
-        self.results_box["state"] = "disabled"
+        self.results_box['state'] = 'disabled'
 
 
 class CollocationsModel:
@@ -434,7 +436,7 @@ def app():
     c.mainloop()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     app()
 
-__all__ = ["app"]
+__all__ = ['app']
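
CollocationsView above never touches a widget from its worker thread: the corpus load runs in the background, completion is posted to a Queue, and a Tk after() timer polls that queue from the mainloop (fire_event's when='tail' plays the same role for the virtual events). A minimal sketch of that polling pattern, with hypothetical names and a stubbed-out corpus read, not the commit's own code:

import threading
from six.moves import queue as q
from six.moves.tkinter import Tk, Label

POLL_INTERVAL = 100                 # milliseconds, as in the view above
messages = q.Queue()

root = Tk()
status = Label(root, text='Loading corpus...')
status.pack()

def load_corpus():
    # The long-running corpus read would happen here, off the GUI thread.
    messages.put('Corpus is loaded')

def poll():
    try:
        status['text'] = messages.get_nowait()   # widget update on mainloop
    except q.Empty:
        root.after(POLL_INTERVAL, poll)          # nothing yet, keep polling

threading.Thread(target=load_corpus).start()
root.after(POLL_INTERVAL, poll)
# root.mainloop()
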
index afdef61..a7f55d3 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Concordance Application
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -8,9 +8,9 @@
 import re
 import threading
 
-import queue as q
-from tkinter.font import Font
-from tkinter import (
+from six.moves import queue as q
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (
     Tk,
     Button,
     END,
@@ -27,6 +27,7 @@ from tkinter import (
     Text,
 )
 
+import nltk.compat
 from nltk.corpus import (
     cess_cat,
     brown,
@@ -42,89 +43,89 @@ from nltk.corpus import (
 from nltk.util import in_idle
 from nltk.draw.util import ShowText
 
-WORD_OR_TAG = "[^/ ]+"
-BOUNDARY = r"\b"
+WORD_OR_TAG = '[^/ ]+'
+BOUNDARY = r'\b'
 
-CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
-SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
-SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
-ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
+CORPUS_LOADED_EVENT = '<<CL_EVENT>>'
+SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>'
+SEARCH_ERROR_EVENT = '<<SE_EVENT>>'
+ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>'
 
 POLL_INTERVAL = 50
 
 # NB All corpora must be specified in a lambda expression so as not to be
 # loaded when the module is imported.
 
-_DEFAULT = "English: Brown Corpus (Humor, simplified)"
+_DEFAULT = 'English: Brown Corpus (Humor, simplified)'
 _CORPORA = {
-    "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(
-        tagset="universal"
+    'Catalan: CESS-CAT Corpus (simplified)': lambda: cess_cat.tagged_sents(
+        tagset='universal'
     ),
-    "English: Brown Corpus": lambda: brown.tagged_sents(),
-    "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(
-        tagset="universal"
+    'English: Brown Corpus': lambda: brown.tagged_sents(),
+    'English: Brown Corpus (simplified)': lambda: brown.tagged_sents(
+        tagset='universal'
     ),
-    "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
-        categories=["news", "editorial", "reviews"], tagset="universal"
+    'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(
+        categories=['news', 'editorial', 'reviews'], tagset='universal'
     ),
-    "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(
-        categories="religion", tagset="universal"
+    'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(
+        categories='religion', tagset='universal'
     ),
-    "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(
-        categories="learned", tagset="universal"
+    'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(
+        categories='learned', tagset='universal'
     ),
-    "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
-        categories="science_fiction", tagset="universal"
+    'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(
+        categories='science_fiction', tagset='universal'
     ),
-    "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(
-        categories="romance", tagset="universal"
+    'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(
+        categories='romance', tagset='universal'
     ),
-    "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(
-        categories="humor", tagset="universal"
+    'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(
+        categories='humor', tagset='universal'
     ),
-    "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
-    "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
-        tagset="universal"
+    'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(),
+    'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(
+        tagset='universal'
     ),
-    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
-    "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
-        tagset="universal"
+    'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(),
+    'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(
+        tagset='universal'
     ),
-    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
-    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
-        tagset="universal"
+    'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(),
+    'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(
+        tagset='universal'
     ),
-    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
-    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
-        tagset="universal"
+    'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(),
+    'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(
+        tagset='universal'
     ),
-    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
-    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
-        files="hindi.pos", tagset="universal"
+    'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'),
+    'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(
+        files='hindi.pos', tagset='universal'
     ),
-    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
-    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
-        tagset="universal"
+    'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(),
+    'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(
+        tagset='universal'
     ),
-    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
-    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(
-        tagset="universal"
+    'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(),
+    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(
+        tagset='universal'
     ),
-    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(
-        tagset="universal"
+    'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(
+        tagset='universal'
     ),
 }
 
 
 class ConcordanceSearchView(object):
-    _BACKGROUND_COLOUR = "#FFF"  # white
+    _BACKGROUND_COLOUR = '#FFF'  # white
 
     # Colour of highlighted results
-    _HIGHLIGHT_WORD_COLOUR = "#F00"  # red
-    _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"
+    _HIGHLIGHT_WORD_COLOUR = '#F00'  # red
+    _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG'
 
-    _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0"  # dark grey
-    _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG"
+    _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0'  # dark grey
+    _HIGHLIGHT_LABEL_TAG = 'HL_LBL_TAG'
 
     # Percentage of text left of the scrollbar position
     _FRACTION_LEFT_TEXT = 0.30
@@ -140,10 +141,10 @@ class ConcordanceSearchView(object):
         self.after = self.top.after(POLL_INTERVAL, self._poll)
 
     def _init_top(self, top):
-        top.geometry("950x680+50+50")
-        top.title("NLTK Concordance Search")
-        top.bind("<Control-q>", self.destroy)
-        top.protocol("WM_DELETE_WINDOW", self.destroy)
+        top.geometry('950x680+50+50')
+        top.title('NLTK Concordance Search')
+        top.bind('<Control-q>', self.destroy)
+        top.protocol('WM_DELETE_WINDOW', self.destroy)
         top.minsize(950, 680)
 
     def _init_widgets(self, parent):
@@ -155,7 +156,7 @@ class ConcordanceSearchView(object):
         self._init_results_box(self.main_frame)
         self._init_paging(self.main_frame)
         self._init_status(self.main_frame)
-        self.main_frame.pack(fill="both", expand=True)
+        self.main_frame.pack(fill='both', expand=True)
 
     def _init_menubar(self):
         self._result_size = IntVar(self.top)
@@ -165,90 +166,90 @@ class ConcordanceSearchView(object):
 
         filemenu = Menu(menubar, tearoff=0, borderwidth=0)
         filemenu.add_command(
-            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
+            label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-q'
         )
-        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
 
         editmenu = Menu(menubar, tearoff=0)
         rescntmenu = Menu(editmenu, tearoff=0)
         rescntmenu.add_radiobutton(
-            label="20",
+            label='20',
             variable=self._result_size,
             underline=0,
             value=20,
             command=self.set_result_size,
         )
         rescntmenu.add_radiobutton(
-            label="50",
+            label='50',
             variable=self._result_size,
             underline=0,
             value=50,
             command=self.set_result_size,
         )
         rescntmenu.add_radiobutton(
-            label="100",
+            label='100',
             variable=self._result_size,
             underline=0,
             value=100,
             command=self.set_result_size,
         )
         rescntmenu.invoke(1)
-        editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
+        editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu)
 
         cntxmenu = Menu(editmenu, tearoff=0)
         cntxbfmenu = Menu(cntxmenu, tearoff=0)
         cntxbfmenu.add_radiobutton(
-            label="60 characters",
+            label='60 characters',
             variable=self._cntx_bf_len,
             underline=0,
             value=60,
             command=self.set_cntx_bf_len,
         )
         cntxbfmenu.add_radiobutton(
-            label="80 characters",
+            label='80 characters',
             variable=self._cntx_bf_len,
             underline=0,
             value=80,
             command=self.set_cntx_bf_len,
         )
         cntxbfmenu.add_radiobutton(
-            label="100 characters",
+            label='100 characters',
             variable=self._cntx_bf_len,
             underline=0,
             value=100,
             command=self.set_cntx_bf_len,
         )
         cntxbfmenu.invoke(1)
-        cntxmenu.add_cascade(label="Before", underline=0, menu=cntxbfmenu)
+        cntxmenu.add_cascade(label='Before', underline=0, menu=cntxbfmenu)
 
         cntxafmenu = Menu(cntxmenu, tearoff=0)
         cntxafmenu.add_radiobutton(
-            label="70 characters",
+            label='70 characters',
             variable=self._cntx_af_len,
             underline=0,
             value=70,
             command=self.set_cntx_af_len,
         )
         cntxafmenu.add_radiobutton(
-            label="90 characters",
+            label='90 characters',
             variable=self._cntx_af_len,
             underline=0,
             value=90,
             command=self.set_cntx_af_len,
         )
         cntxafmenu.add_radiobutton(
-            label="110 characters",
+            label='110 characters',
             variable=self._cntx_af_len,
             underline=0,
             value=110,
             command=self.set_cntx_af_len,
         )
         cntxafmenu.invoke(1)
-        cntxmenu.add_cascade(label="After", underline=0, menu=cntxafmenu)
+        cntxmenu.add_cascade(label='After', underline=0, menu=cntxafmenu)
 
-        editmenu.add_cascade(label="Context", underline=0, menu=cntxmenu)
+        editmenu.add_cascade(label='Context', underline=0, menu=cntxmenu)
 
-        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+        menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
 
         self.top.config(menu=menubar)
 
@@ -268,12 +269,12 @@ class ConcordanceSearchView(object):
         Label(
             innerframe,
             justify=LEFT,
-            text=" Corpus: ",
+            text=' Corpus: ',
             background=self._BACKGROUND_COLOUR,
             padx=2,
             pady=1,
             border=0,
-        ).pack(side="left")
+        ).pack(side='left')
 
         other_corpora = list(self.model.CORPORA.keys()).remove(
             self.model.DEFAULT_CORPUS
@@ -285,10 +286,10 @@ class ConcordanceSearchView(object):
             command=self.corpus_selected,
             *self.model.non_default_corpora()
         )
-        om["borderwidth"] = 0
-        om["highlightthickness"] = 1
-        om.pack(side="left")
-        innerframe.pack(side="top", fill="x", anchor="n")
+        om['borderwidth'] = 0
+        om['highlightthickness'] = 1
+        om.pack(side='left')
+        innerframe.pack(side='top', fill='x', anchor='n')
 
     def _init_status(self, parent):
         self.status = Label(
@@ -300,24 +301,24 @@ class ConcordanceSearchView(object):
             padx=1,
             pady=0,
         )
-        self.status.pack(side="top", anchor="sw")
+        self.status.pack(side='top', anchor='sw')
 
     def _init_query_box(self, parent):
         innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
         another = Frame(innerframe, background=self._BACKGROUND_COLOUR)
         self.query_box = Entry(another, width=60)
-        self.query_box.pack(side="left", fill="x", pady=25, anchor="center")
+        self.query_box.pack(side='left', fill='x', pady=25, anchor='center')
         self.search_button = Button(
             another,
-            text="Search",
+            text='Search',
             command=self.search,
             borderwidth=1,
             highlightthickness=1,
         )
-        self.search_button.pack(side="left", fill="x", pady=25, anchor="center")
-        self.query_box.bind("<KeyPress-Return>", self.search_enter_keypress_handler)
+        self.search_button.pack(side='left', fill='x', pady=25, anchor='center')
+        self.query_box.bind('<KeyPress-Return>', self.search_enter_keypress_handler)
         another.pack()
-        innerframe.pack(side="top", fill="x", anchor="n")
+        innerframe.pack(side='top', fill='x', anchor='n')
 
     def search_enter_keypress_handler(self, *event):
         self.search()
@@ -327,61 +328,61 @@ class ConcordanceSearchView(object):
         i1 = Frame(innerframe)
         i2 = Frame(innerframe)
         vscrollbar = Scrollbar(i1, borderwidth=1)
-        hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
+        hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
         self.results_box = Text(
             i1,
-            font=Font(family="courier", size="16"),
-            state="disabled",
+            font=Font(family='courier', size='16'),
+            state='disabled',
             borderwidth=1,
             yscrollcommand=vscrollbar.set,
             xscrollcommand=hscrollbar.set,
-            wrap="none",
-            width="40",
-            height="20",
+            wrap='none',
+            width='40',
+            height='20',
             exportselection=1,
         )
-        self.results_box.pack(side="left", fill="both", expand=True)
+        self.results_box.pack(side='left', fill='both', expand=True)
         self.results_box.tag_config(
             self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR
         )
         self.results_box.tag_config(
             self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR
         )
-        vscrollbar.pack(side="left", fill="y", anchor="e")
+        vscrollbar.pack(side='left', fill='y', anchor='e')
         vscrollbar.config(command=self.results_box.yview)
-        hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
+        hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
         hscrollbar.config(command=self.results_box.xview)
         # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
-        Label(i2, text="   ", background=self._BACKGROUND_COLOUR).pack(
-            side="left", anchor="e"
+        Label(i2, text='   ', background=self._BACKGROUND_COLOUR).pack(
+            side='left', anchor='e'
         )
-        i1.pack(side="top", fill="both", expand=True, anchor="n")
-        i2.pack(side="bottom", fill="x", anchor="s")
-        innerframe.pack(side="top", fill="both", expand=True)
+        i1.pack(side='top', fill='both', expand=True, anchor='n')
+        i2.pack(side='bottom', fill='x', anchor='s')
+        innerframe.pack(side='top', fill='both', expand=True)
 
     def _init_paging(self, parent):
         innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
         self.prev = prev = Button(
             innerframe,
-            text="Previous",
+            text='Previous',
             command=self.previous,
-            width="10",
+            width='10',
             borderwidth=1,
             highlightthickness=1,
-            state="disabled",
+            state='disabled',
         )
-        prev.pack(side="left", anchor="center")
+        prev.pack(side='left', anchor='center')
         self.next = next = Button(
             innerframe,
-            text="Next",
+            text='Next',
             command=self.__next__,
-            width="10",
+            width='10',
             borderwidth=1,
             highlightthickness=1,
-            state="disabled",
+            state='disabled',
         )
-        next.pack(side="right", anchor="center")
-        innerframe.pack(side="top", fill="y")
+        next.pack(side='right', anchor='center')
+        innerframe.pack(side='top', fill='y')
         self.current_page = 0
 
     def previous(self):
@@ -396,9 +397,9 @@ class ConcordanceSearchView(object):
 
     def about(self, *e):
         ABOUT = "NLTK Concordance Search Demo\n"
-        TITLE = "About: NLTK Concordance Search Demo"
+        TITLE = 'About: NLTK Concordance Search Demo'
         try:
-            from tkinter.messagebox import Message
+            from six.moves.tkinter_messagebox import Message
 
             Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
         except:
@@ -427,13 +428,13 @@ class ConcordanceSearchView(object):
         self.after = self.top.after(POLL_INTERVAL, self._poll)
 
     def handle_error_loading_corpus(self, event):
-        self.status["text"] = "Error in loading " + self.var.get()
+        self.status['text'] = 'Error in loading ' + self.var.get()
         self.unfreeze_editable()
         self.clear_all()
         self.freeze_editable()
 
     def handle_corpus_loaded(self, event):
-        self.status["text"] = self.var.get() + " is loaded"
+        self.status['text'] = self.var.get() + ' is loaded'
         self.unfreeze_editable()
         self.clear_all()
         self.query_box.focus_set()
@@ -442,16 +443,16 @@ class ConcordanceSearchView(object):
         # todo: refactor the model such that it is less state sensitive
         results = self.model.get_results()
         self.write_results(results)
-        self.status["text"] = ""
+        self.status['text'] = ''
         if len(results) == 0:
-            self.status["text"] = "No results found for " + self.model.query
+            self.status['text'] = 'No results found for ' + self.model.query
         else:
             self.current_page = self.model.last_requested_page
         self.unfreeze_editable()
         self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT)
 
     def handle_search_error(self, event):
-        self.status["text"] = "Error in query " + self.model.query
+        self.status['text'] = 'Error in query ' + self.model.query
         self.unfreeze_editable()
 
     def corpus_selected(self, *args):
@@ -460,7 +461,7 @@ class ConcordanceSearchView(object):
 
     def load_corpus(self, selection):
         if self.model.selected_corpus != selection:
-            self.status["text"] = "Loading " + selection + "..."
+            self.status['text'] = 'Loading ' + selection + '...'
             self.freeze_editable()
             self.model.load_corpus(selection)
 
@@ -471,12 +472,12 @@ class ConcordanceSearchView(object):
         query = self.query_box.get()
         if len(query.strip()) == 0:
             return
-        self.status["text"] = "Searching for " + query
+        self.status['text'] = 'Searching for ' + query
         self.freeze_editable()
         self.model.search(query, self.current_page + 1)
 
     def write_results(self, results):
-        self.results_box["state"] = "normal"
+        self.results_box['state'] = 'normal'
         row = 1
         for each in results:
             sent, pos1, pos2 = each[0].strip(), each[1], each[2]
@@ -485,34 +486,34 @@ class ConcordanceSearchView(object):
                     sent, pos1, pos2 = self.pad(sent, pos1, pos2)
                 sentence = sent[pos1 - self._char_before : pos1 + self._char_after]
                 if not row == len(results):
-                    sentence += "\n"
-                self.results_box.insert(str(row) + ".0", sentence)
+                    sentence += '\n'
+                self.results_box.insert(str(row) + '.0', sentence)
                 word_markers, label_markers = self.words_and_labels(sent, pos1, pos2)
                 for marker in word_markers:
                     self.results_box.tag_add(
                         self._HIGHLIGHT_WORD_TAG,
-                        str(row) + "." + str(marker[0]),
-                        str(row) + "." + str(marker[1]),
+                        str(row) + '.' + str(marker[0]),
+                        str(row) + '.' + str(marker[1]),
                     )
                 for marker in label_markers:
                     self.results_box.tag_add(
                         self._HIGHLIGHT_LABEL_TAG,
-                        str(row) + "." + str(marker[0]),
-                        str(row) + "." + str(marker[1]),
+                        str(row) + '.' + str(marker[0]),
+                        str(row) + '.' + str(marker[1]),
                     )
                 row += 1
-        self.results_box["state"] = "disabled"
+        self.results_box['state'] = 'disabled'
 
     def words_and_labels(self, sentence, pos1, pos2):
         search_exp = sentence[pos1:pos2]
         words, labels = [], []
-        labeled_words = search_exp.split(" ")
+        labeled_words = search_exp.split(' ')
         index = 0
         for each in labeled_words:
-            if each == "":
+            if each == '':
                 index += 1
             else:
-                word, label = each.split("/")
+                word, label = each.split('/')
                 words.append(
                     (self._char_before + index, self._char_before + index + len(word))
                 )
@@ -528,7 +529,7 @@ class ConcordanceSearchView(object):
         if hstart >= self._char_before:
             return sent, hstart, hend
         d = self._char_before - hstart
-        sent = "".join([" "] * d) + sent
+        sent = ''.join([' '] * d) + sent
         return sent, hstart + d, hend + d
 
     def destroy(self, *e):
@@ -544,34 +545,34 @@ class ConcordanceSearchView(object):
         self.clear_results_box()
 
     def clear_results_box(self):
-        self.results_box["state"] = "normal"
+        self.results_box['state'] = 'normal'
         self.results_box.delete("1.0", END)
-        self.results_box["state"] = "disabled"
+        self.results_box['state'] = 'disabled'
 
     def freeze_editable(self):
-        self.query_box["state"] = "disabled"
-        self.search_button["state"] = "disabled"
-        self.prev["state"] = "disabled"
-        self.next["state"] = "disabled"
+        self.query_box['state'] = 'disabled'
+        self.search_button['state'] = 'disabled'
+        self.prev['state'] = 'disabled'
+        self.next['state'] = 'disabled'
 
     def unfreeze_editable(self):
-        self.query_box["state"] = "normal"
-        self.search_button["state"] = "normal"
+        self.query_box['state'] = 'normal'
+        self.search_button['state'] = 'normal'
         self.set_paging_button_states()
 
     def set_paging_button_states(self):
         if self.current_page == 0 or self.current_page == 1:
-            self.prev["state"] = "disabled"
+            self.prev['state'] = 'disabled'
         else:
-            self.prev["state"] = "normal"
+            self.prev['state'] = 'normal'
         if self.model.has_more_pages(self.current_page):
-            self.next["state"] = "normal"
+            self.next['state'] = 'normal'
         else:
-            self.next["state"] = "disabled"
+            self.next['state'] = 'disabled'
 
     def fire_event(self, event):
         # Firing an event so that rendering of widgets happens in the mainloop thread
-        self.top.event_generate(event, when="tail")
+        self.top.event_generate(event, when='tail')
 
     def mainloop(self, *args, **kwargs):
         if in_idle():
@@ -649,7 +650,7 @@ class ConcordanceSearchModel(object):
             try:
                 ts = self.model.CORPORA[self.name]()
                 self.model.tagged_sents = [
-                    " ".join(w + "/" + t for (w, t) in sent) for sent in ts
+                    ' '.join(w + '/' + t for (w, t) in sent) for sent in ts
                 ]
                 self.model.queue.put(CORPUS_LOADED_EVENT)
             except Exception as e:
@@ -689,14 +690,14 @@ class ConcordanceSearchModel(object):
         def processed_query(self):
             new = []
             for term in self.model.query.split():
-                term = re.sub(r"\.", r"[^/ ]", term)
-                if re.match("[A-Z]+$", term):
-                    new.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY)
-                elif "/" in term:
+                term = re.sub(r'\.', r'[^/ ]', term)
+                if re.match('[A-Z]+$', term):
+                    new.append(BOUNDARY + WORD_OR_TAG + '/' + term + BOUNDARY)
+                elif '/' in term:
                     new.append(BOUNDARY + term + BOUNDARY)
                 else:
-                    new.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY)
-            return " ".join(new)
+                    new.append(BOUNDARY + term + '/' + WORD_OR_TAG + BOUNDARY)
+            return ' '.join(new)
 
 
 def app():
@@ -704,7 +705,7 @@ def app():
     d.mainloop()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     app()
 
-__all__ = ["app"]
+__all__ = ['app']
index e653ea1..639f767 100644
@@ -10,7 +10,7 @@ Created by Aristide Grange
 import re
 import itertools
 
-from tkinter import (
+from six.moves.tkinter import (
     Frame,
     Label,
     PhotoImage,
@@ -62,8 +62,8 @@ class Zone:
     def __init__(self, image, initialField, initialText):
         frm = Frame(root)
         frm.config(background="white")
-        self.image = PhotoImage(format="gif", data=images[image.upper()])
-        self.imageDimmed = PhotoImage(format="gif", data=images[image])
+        self.image = PhotoImage(format='gif', data=images[image.upper()])
+        self.imageDimmed = PhotoImage(format='gif', data=images[image])
         self.img = Label(frm)
         self.img.config(borderwidth=0)
         self.img.pack(side="left")
@@ -168,7 +168,7 @@ def app():
     root.mainloop()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     app()
 
-__all__ = ["app"]
+__all__ = ['app']
index 9437bff..5d3054d 100644
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Recursive Descent Parser Application
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -63,9 +63,10 @@ Keyboard Shortcuts::
       [Ctrl-p]\t Print
       [q]\t Quit
 """
+from __future__ import division
 
-from tkinter.font import Font
-from tkinter import Listbox, IntVar, Button, Frame, Label, Menu, Scrollbar, Tk
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import Listbox, IntVar, Button, Frame, Label, Menu, Scrollbar, Tk
 
 from nltk.tree import Tree
 from nltk.util import in_idle
@@ -91,7 +92,7 @@ class RecursiveDescentApp(object):
 
         # Set up the main window.
         self._top = Tk()
-        self._top.title("Recursive Descent Parser Application")
+        self._top.title('Recursive Descent Parser Application')
 
         # Set up key bindings.
         self._init_bindings()
@@ -121,7 +122,7 @@ class RecursiveDescentApp(object):
         self._parser.initialize(self._sent)
 
         # Resize callback
-        self._canvas.bind("<Configure>", self._configure)
+        self._canvas.bind('<Configure>', self._configure)
 
     #########################################
     ##  Initialization Helpers
@@ -134,127 +135,127 @@ class RecursiveDescentApp(object):
 
         # What's our font size (default=same as sysfont)
         self._size = IntVar(root)
-        self._size.set(self._sysfont.cget("size"))
+        self._size.set(self._sysfont.cget('size'))
 
-        self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
-        self._font = Font(family="helvetica", size=self._size.get())
+        self._boldfont = Font(family='helvetica', weight='bold', size=self._size.get())
+        self._font = Font(family='helvetica', size=self._size.get())
         if self._size.get() < 0:
             big = self._size.get() - 2
         else:
             big = self._size.get() + 2
-        self._bigfont = Font(family="helvetica", weight="bold", size=big)
+        self._bigfont = Font(family='helvetica', weight='bold', size=big)
 
     def _init_grammar(self, parent):
         # Grammar view.
         self._prodframe = listframe = Frame(parent)
-        self._prodframe.pack(fill="both", side="left", padx=2)
+        self._prodframe.pack(fill='both', side='left', padx=2)
         self._prodlist_label = Label(
-            self._prodframe, font=self._boldfont, text="Available Expansions"
+            self._prodframe, font=self._boldfont, text='Available Expansions'
         )
         self._prodlist_label.pack()
         self._prodlist = Listbox(
             self._prodframe,
-            selectmode="single",
-            relief="groove",
-            background="white",
-            foreground="#909090",
+            selectmode='single',
+            relief='groove',
+            background='white',
+            foreground='#909090',
             font=self._font,
-            selectforeground="#004040",
-            selectbackground="#c0f0c0",
+            selectforeground='#004040',
+            selectbackground='#c0f0c0',
         )
 
-        self._prodlist.pack(side="right", fill="both", expand=1)
+        self._prodlist.pack(side='right', fill='both', expand=1)
 
         self._productions = list(self._parser.grammar().productions())
         for production in self._productions:
-            self._prodlist.insert("end", ("  %s" % production))
+            self._prodlist.insert('end', ('  %s' % production))
         self._prodlist.config(height=min(len(self._productions), 25))
 
         # Add a scrollbar if there are more than 25 productions.
         if len(self._productions) > 25:
-            listscroll = Scrollbar(self._prodframe, orient="vertical")
+            listscroll = Scrollbar(self._prodframe, orient='vertical')
             self._prodlist.config(yscrollcommand=listscroll.set)
             listscroll.config(command=self._prodlist.yview)
-            listscroll.pack(side="left", fill="y")
+            listscroll.pack(side='left', fill='y')
 
         # If they select a production, apply it.
-        self._prodlist.bind("<<ListboxSelect>>", self._prodlist_select)
+        self._prodlist.bind('<<ListboxSelect>>', self._prodlist_select)
 
     def _init_bindings(self):
         # Key bindings are a good thing.
-        self._top.bind("<Control-q>", self.destroy)
-        self._top.bind("<Control-x>", self.destroy)
-        self._top.bind("<Escape>", self.destroy)
-        self._top.bind("e", self.expand)
+        self._top.bind('<Control-q>', self.destroy)
+        self._top.bind('<Control-x>', self.destroy)
+        self._top.bind('<Escape>', self.destroy)
+        self._top.bind('e', self.expand)
         # self._top.bind('<Alt-e>', self.expand)
         # self._top.bind('<Control-e>', self.expand)
-        self._top.bind("m", self.match)
-        self._top.bind("<Alt-m>", self.match)
-        self._top.bind("<Control-m>", self.match)
-        self._top.bind("b", self.backtrack)
-        self._top.bind("<Alt-b>", self.backtrack)
-        self._top.bind("<Control-b>", self.backtrack)
-        self._top.bind("<Control-z>", self.backtrack)
-        self._top.bind("<BackSpace>", self.backtrack)
-        self._top.bind("a", self.autostep)
+        self._top.bind('m', self.match)
+        self._top.bind('<Alt-m>', self.match)
+        self._top.bind('<Control-m>', self.match)
+        self._top.bind('b', self.backtrack)
+        self._top.bind('<Alt-b>', self.backtrack)
+        self._top.bind('<Control-b>', self.backtrack)
+        self._top.bind('<Control-z>', self.backtrack)
+        self._top.bind('<BackSpace>', self.backtrack)
+        self._top.bind('a', self.autostep)
         # self._top.bind('<Control-a>', self.autostep)
-        self._top.bind("<Control-space>", self.autostep)
-        self._top.bind("<Control-c>", self.cancel_autostep)
-        self._top.bind("<space>", self.step)
-        self._top.bind("<Delete>", self.reset)
-        self._top.bind("<Control-p>", self.postscript)
+        self._top.bind('<Control-space>', self.autostep)
+        self._top.bind('<Control-c>', self.cancel_autostep)
+        self._top.bind('<space>', self.step)
+        self._top.bind('<Delete>', self.reset)
+        self._top.bind('<Control-p>', self.postscript)
         # self._top.bind('<h>', self.help)
         # self._top.bind('<Alt-h>', self.help)
-        self._top.bind("<Control-h>", self.help)
-        self._top.bind("<F1>", self.help)
+        self._top.bind('<Control-h>', self.help)
+        self._top.bind('<F1>', self.help)
         # self._top.bind('<g>', self.toggle_grammar)
         # self._top.bind('<Alt-g>', self.toggle_grammar)
         # self._top.bind('<Control-g>', self.toggle_grammar)
-        self._top.bind("<Control-g>", self.edit_grammar)
-        self._top.bind("<Control-t>", self.edit_sentence)
+        self._top.bind('<Control-g>', self.edit_grammar)
+        self._top.bind('<Control-t>', self.edit_sentence)
 
     def _init_buttons(self, parent):
         # Set up the frames.
         self._buttonframe = buttonframe = Frame(parent)
-        buttonframe.pack(fill="none", side="bottom", padx=3, pady=2)
+        buttonframe.pack(fill='none', side='bottom', padx=3, pady=2)
         Button(
             buttonframe,
-            text="Step",
-            background="#90c0d0",
-            foreground="black",
+            text='Step',
+            background='#90c0d0',
+            foreground='black',
             command=self.step,
-        ).pack(side="left")
+        ).pack(side='left')
         Button(
             buttonframe,
-            text="Autostep",
-            background="#90c0d0",
-            foreground="black",
+            text='Autostep',
+            background='#90c0d0',
+            foreground='black',
             command=self.autostep,
-        ).pack(side="left")
+        ).pack(side='left')
         Button(
             buttonframe,
-            text="Expand",
+            text='Expand',
             underline=0,
-            background="#90f090",
-            foreground="black",
+            background='#90f090',
+            foreground='black',
             command=self.expand,
-        ).pack(side="left")
+        ).pack(side='left')
         Button(
             buttonframe,
-            text="Match",
+            text='Match',
             underline=0,
-            background="#90f090",
-            foreground="black",
+            background='#90f090',
+            foreground='black',
             command=self.match,
-        ).pack(side="left")
+        ).pack(side='left')
         Button(
             buttonframe,
-            text="Backtrack",
+            text='Backtrack',
             underline=0,
-            background="#f0a0a0",
-            foreground="black",
+            background='#f0a0a0',
+            foreground='black',
             command=self.backtrack,
-        ).pack(side="left")
+        ).pack(side='left')
         # Replace autostep...
 
     #         self._autostep_button = Button(buttonframe, text='Autostep',
@@ -265,42 +266,42 @@ class RecursiveDescentApp(object):
         self._autostep = 0
         (x1, y1, x2, y2) = self._cframe.scrollregion()
         y2 = event.height - 6
-        self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2)
+        self._canvas['scrollregion'] = '%d %d %d %d' % (x1, y1, x2, y2)
         self._redraw()
 
     def _init_feedback(self, parent):
         self._feedbackframe = feedbackframe = Frame(parent)
-        feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3)
+        feedbackframe.pack(fill='x', side='bottom', padx=3, pady=3)
         self._lastoper_label = Label(
-            feedbackframe, text="Last Operation:", font=self._font
+            feedbackframe, text='Last Operation:', font=self._font
         )
-        self._lastoper_label.pack(side="left")
-        lastoperframe = Frame(feedbackframe, relief="sunken", border=1)
-        lastoperframe.pack(fill="x", side="right", expand=1, padx=5)
+        self._lastoper_label.pack(side='left')
+        lastoperframe = Frame(feedbackframe, relief='sunken', border=1)
+        lastoperframe.pack(fill='x', side='right', expand=1, padx=5)
         self._lastoper1 = Label(
-            lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font
+            lastoperframe, foreground='#007070', background='#f0f0f0', font=self._font
         )
         self._lastoper2 = Label(
             lastoperframe,
-            anchor="w",
+            anchor='w',
             width=30,
-            foreground="#004040",
-            background="#f0f0f0",
+            foreground='#004040',
+            background='#f0f0f0',
             font=self._font,
         )
-        self._lastoper1.pack(side="left")
-        self._lastoper2.pack(side="left", fill="x", expand=1)
+        self._lastoper1.pack(side='left')
+        self._lastoper2.pack(side='left', fill='x', expand=1)
 
     def _init_canvas(self, parent):
         self._cframe = CanvasFrame(
             parent,
-            background="white",
+            background='white',
             # width=525, height=250,
             closeenough=10,
             border=2,
-            relief="sunken",
+            relief='sunken',
         )
-        self._cframe.pack(expand=1, fill="both", side="top", pady=2)
+        self._cframe.pack(expand=1, fill='both', side='top', pady=2)
         canvas = self._canvas = self._cframe.canvas()
 
         # Initially, there's no tree or text
@@ -313,50 +314,50 @@ class RecursiveDescentApp(object):
 
         filemenu = Menu(menubar, tearoff=0)
         filemenu.add_command(
-            label="Reset Parser", underline=0, command=self.reset, accelerator="Del"
+            label='Reset Parser', underline=0, command=self.reset, accelerator='Del'
         )
         filemenu.add_command(
-            label="Print to Postscript",
+            label='Print to Postscript',
             underline=0,
             command=self.postscript,
-            accelerator="Ctrl-p",
+            accelerator='Ctrl-p',
         )
         filemenu.add_command(
-            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+            label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
         )
-        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
 
         editmenu = Menu(menubar, tearoff=0)
         editmenu.add_command(
-            label="Edit Grammar",
+            label='Edit Grammar',
             underline=5,
             command=self.edit_grammar,
-            accelerator="Ctrl-g",
+            accelerator='Ctrl-g',
         )
         editmenu.add_command(
-            label="Edit Text",
+            label='Edit Text',
             underline=5,
             command=self.edit_sentence,
-            accelerator="Ctrl-t",
+            accelerator='Ctrl-t',
         )
-        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+        menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
 
         rulemenu = Menu(menubar, tearoff=0)
         rulemenu.add_command(
-            label="Step", underline=1, command=self.step, accelerator="Space"
+            label='Step', underline=1, command=self.step, accelerator='Space'
         )
         rulemenu.add_separator()
         rulemenu.add_command(
-            label="Match", underline=0, command=self.match, accelerator="Ctrl-m"
+            label='Match', underline=0, command=self.match, accelerator='Ctrl-m'
         )
         rulemenu.add_command(
-            label="Expand", underline=0, command=self.expand, accelerator="Ctrl-e"
+            label='Expand', underline=0, command=self.expand, accelerator='Ctrl-e'
         )
         rulemenu.add_separator()
         rulemenu.add_command(
-            label="Backtrack", underline=0, command=self.backtrack, accelerator="Ctrl-b"
+            label='Backtrack', underline=0, command=self.backtrack, accelerator='Ctrl-b'
         )
-        menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)
+        menubar.add_cascade(label='Apply', underline=0, menu=rulemenu)
 
         viewmenu = Menu(menubar, tearoff=0)
         viewmenu.add_checkbutton(
@@ -367,41 +368,41 @@ class RecursiveDescentApp(object):
         )
         viewmenu.add_separator()
         viewmenu.add_radiobutton(
-            label="Tiny",
+            label='Tiny',
             variable=self._size,
             underline=0,
             value=10,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Small",
+            label='Small',
             variable=self._size,
             underline=0,
             value=12,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Medium",
+            label='Medium',
             variable=self._size,
             underline=0,
             value=14,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Large",
+            label='Large',
             variable=self._size,
             underline=0,
             value=18,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Huge",
+            label='Huge',
             variable=self._size,
             underline=0,
             value=24,
             command=self.resize,
         )
-        menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+        menubar.add_cascade(label='View', underline=0, menu=viewmenu)
 
         animatemenu = Menu(menubar, tearoff=0)
         animatemenu.add_radiobutton(
@@ -412,30 +413,30 @@ class RecursiveDescentApp(object):
             underline=0,
             variable=self._animation_frames,
             value=10,
-            accelerator="-",
+            accelerator='-',
         )
         animatemenu.add_radiobutton(
             label="Normal Animation",
             underline=0,
             variable=self._animation_frames,
             value=5,
-            accelerator="=",
+            accelerator='=',
         )
         animatemenu.add_radiobutton(
             label="Fast Animation",
             underline=0,
             variable=self._animation_frames,
             value=2,
-            accelerator="+",
+            accelerator='+',
         )
         menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
 
         helpmenu = Menu(menubar, tearoff=0)
-        helpmenu.add_command(label="About", underline=0, command=self.about)
+        helpmenu.add_command(label='About', underline=0, command=self.about)
         helpmenu.add_command(
-            label="Instructions", underline=0, command=self.help, accelerator="F1"
+            label='Instructions', underline=0, command=self.help, accelerator='F1'
         )
-        menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+        menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
 
         parent.config(menu=menubar)
 
@@ -466,20 +467,20 @@ class RecursiveDescentApp(object):
             self._canvas.delete(self._textline)
 
         # Draw the tree.
-        helv = ("helvetica", -self._size.get())
-        bold = ("helvetica", -self._size.get(), "bold")
+        helv = ('helvetica', -self._size.get())
+        bold = ('helvetica', -self._size.get(), 'bold')
         attribs = {
-            "tree_color": "#000000",
-            "tree_width": 2,
-            "node_font": bold,
-            "leaf_font": helv,
+            'tree_color': '#000000',
+            'tree_width': 2,
+            'node_font': bold,
+            'leaf_font': helv,
         }
         tree = self._parser.tree()
         self._tree = tree_to_treesegment(canvas, tree, **attribs)
         self._cframe.add_widget(self._tree, 30, 5)
 
         # Draw the text.
-        helv = ("helvetica", -self._size.get())
+        helv = ('helvetica', -self._size.get())
         bottom = y = self._cframe.scrollregion()[3]
         self._textwidgets = [
             TextWidget(canvas, word, font=self._font) for word in self._sent
@@ -490,7 +491,7 @@ class RecursiveDescentApp(object):
             y = min(y, twidget.bbox()[1])
 
         # Draw a line over the text, to separate it from the tree.
-        self._textline = canvas.create_line(-5000, y - 5, 5000, y - 5, dash=".")
+        self._textline = canvas.create_line(-5000, y - 5, 5000, y - 5, dash='.')
 
         # Highlight appropriate nodes.
         self._highlight_nodes()
@@ -507,30 +508,30 @@ class RecursiveDescentApp(object):
 
     def _highlight_nodes(self):
         # Highlight the list of nodes to be checked.
-        bold = ("helvetica", -self._size.get(), "bold")
+        bold = ('helvetica', -self._size.get(), 'bold')
         for treeloc in self._parser.frontier()[:1]:
-            self._get(self._tree, treeloc)["color"] = "#20a050"
-            self._get(self._tree, treeloc)["font"] = bold
+            self._get(self._tree, treeloc)['color'] = '#20a050'
+            self._get(self._tree, treeloc)['font'] = bold
         for treeloc in self._parser.frontier()[1:]:
-            self._get(self._tree, treeloc)["color"] = "#008080"
+            self._get(self._tree, treeloc)['color'] = '#008080'
 
     def _highlight_prodlist(self):
         # Highlight the productions that can be expanded.
         # Boy, too bad tkinter doesn't implement Listbox.itemconfig;
         # that would be pretty useful here.
-        self._prodlist.delete(0, "end")
+        self._prodlist.delete(0, 'end')
         expandable = self._parser.expandable_productions()
         untried = self._parser.untried_expandable_productions()
         productions = self._productions
         for index in range(len(productions)):
             if productions[index] in expandable:
                 if productions[index] in untried:
-                    self._prodlist.insert(index, " %s" % productions[index])
+                    self._prodlist.insert(index, ' %s' % productions[index])
                 else:
-                    self._prodlist.insert(index, " %s (TRIED)" % productions[index])
+                    self._prodlist.insert(index, ' %s (TRIED)' % productions[index])
                 self._prodlist.selection_set(index)
             else:
-                self._prodlist.insert(index, " %s" % productions[index])
+                self._prodlist.insert(index, ' %s' % productions[index])
 
     def _position_text(self):
         # Line up the text widgets that are matched against the tree
@@ -541,22 +542,22 @@ class RecursiveDescentApp(object):
         for i in range(0, len(leaves)):
             widget = self._textwidgets[i]
             leaf = leaves[i]
-            widget["color"] = "#006040"
-            leaf["color"] = "#006040"
+            widget['color'] = '#006040'
+            leaf['color'] = '#006040'
             widget.move(leaf.bbox()[0] - widget.bbox()[0], 0)
             xmax = widget.bbox()[2] + 10
 
         # Line up the text widgets that are not matched against the tree.
         for i in range(len(leaves), numwords):
             widget = self._textwidgets[i]
-            widget["color"] = "#a0a0a0"
+            widget['color'] = '#a0a0a0'
             widget.move(xmax - widget.bbox()[0], 0)
             xmax = widget.bbox()[2] + 10
 
         # If we have a complete parse, make everything green :)
         if self._parser.currently_complete():
             for twidget in self._textwidgets:
-                twidget["color"] = "#00a000"
+                twidget['color'] = '#00a000'
 
         # Move the matched leaves down to the text.
         for i in range(0, len(leaves)):
@@ -591,8 +592,8 @@ class RecursiveDescentApp(object):
     def reset(self, *e):
         self._autostep = 0
         self._parser.initialize(self._sent)
-        self._lastoper1["text"] = "Reset Application"
-        self._lastoper2["text"] = ""
+        self._lastoper1['text'] = 'Reset Application'
+        self._lastoper2['text'] = ''
         self._redraw()
 
     def autostep(self, *e):
@@ -637,14 +638,14 @@ class RecursiveDescentApp(object):
         elif self._backtrack():
             pass
         else:
-            self._lastoper1["text"] = "Finished"
-            self._lastoper2["text"] = ""
+            self._lastoper1['text'] = 'Finished'
+            self._lastoper2['text'] = ''
             self._autostep = 0
 
         # Check if we just completed a parse.
         if self._parser.currently_complete():
             self._autostep = 0
-            self._lastoper2["text"] += "    [COMPLETE PARSE]"
+            self._lastoper2['text'] += '    [COMPLETE PARSE]'
 
     def _expand(self, *e):
         if self._animating_lock:
@@ -652,16 +653,16 @@ class RecursiveDescentApp(object):
         old_frontier = self._parser.frontier()
         rv = self._parser.expand()
         if rv is not None:
-            self._lastoper1["text"] = "Expand:"
-            self._lastoper2["text"] = rv
-            self._prodlist.selection_clear(0, "end")
+            self._lastoper1['text'] = 'Expand:'
+            self._lastoper2['text'] = rv
+            self._prodlist.selection_clear(0, 'end')
             index = self._productions.index(rv)
             self._prodlist.selection_set(index)
             self._animate_expand(old_frontier[0])
             return True
         else:
-            self._lastoper1["text"] = "Expand:"
-            self._lastoper2["text"] = "(all expansions tried)"
+            self._lastoper1['text'] = 'Expand:'
+            self._lastoper2['text'] = '(all expansions tried)'
             return False
 
     def _match(self, *e):
@@ -670,13 +671,13 @@ class RecursiveDescentApp(object):
         old_frontier = self._parser.frontier()
         rv = self._parser.match()
         if rv is not None:
-            self._lastoper1["text"] = "Match:"
-            self._lastoper2["text"] = rv
+            self._lastoper1['text'] = 'Match:'
+            self._lastoper2['text'] = rv
             self._animate_match(old_frontier[0])
             return True
         else:
-            self._lastoper1["text"] = "Match:"
-            self._lastoper2["text"] = "(failed)"
+            self._lastoper1['text'] = 'Match:'
+            self._lastoper2['text'] = '(failed)'
             return False
 
     def _backtrack(self, *e):
@@ -686,8 +687,8 @@ class RecursiveDescentApp(object):
             elt = self._parser.tree()
             for i in self._parser.frontier()[0]:
                 elt = elt[i]
-            self._lastoper1["text"] = "Backtrack"
-            self._lastoper2["text"] = ""
+            self._lastoper1['text'] = 'Backtrack'
+            self._lastoper2['text'] = ''
             if isinstance(elt, Tree):
                 self._animate_backtrack(self._parser.frontier()[0])
             else:
@@ -695,17 +696,17 @@ class RecursiveDescentApp(object):
             return True
         else:
             self._autostep = 0
-            self._lastoper1["text"] = "Finished"
-            self._lastoper2["text"] = ""
+            self._lastoper1['text'] = 'Finished'
+            self._lastoper2['text'] = ''
             return False
 
     def about(self, *e):
         ABOUT = (
             "NLTK Recursive Descent Parser Application\n" + "Written by Edward Loper"
         )
-        TITLE = "About: Recursive Descent Parser Application"
+        TITLE = 'About: Recursive Descent Parser Application'
         try:
-            from tkinter.messagebox import Message
+            from six.moves.tkinter_messagebox import Message
 
             Message(message=ABOUT, title=TITLE).show()
         except:
@@ -717,16 +718,16 @@ class RecursiveDescentApp(object):
         try:
             ShowText(
                 self._top,
-                "Help: Recursive Descent Parser Application",
-                (__doc__ or "").strip(),
+                'Help: Recursive Descent Parser Application',
+                (__doc__ or '').strip(),
                 width=75,
-                font="fixed",
+                font='fixed',
             )
         except:
             ShowText(
                 self._top,
-                "Help: Recursive Descent Parser Application",
-                (__doc__ or "").strip(),
+                'Help: Recursive Descent Parser Application',
+                (__doc__ or '').strip(),
                 width=75,
             )
 
@@ -762,13 +763,13 @@ class RecursiveDescentApp(object):
     def _toggle_grammar(self, *e):
         if self._show_grammar.get():
             self._prodframe.pack(
-                fill="both", side="left", padx=2, after=self._feedbackframe
+                fill='both', side='left', padx=2, after=self._feedbackframe
             )
-            self._lastoper1["text"] = "Show Grammar"
+            self._lastoper1['text'] = 'Show Grammar'
         else:
             self._prodframe.pack_forget()
-            self._lastoper1["text"] = "Hide Grammar"
-        self._lastoper2["text"] = ""
+            self._lastoper1['text'] = 'Hide Grammar'
+        self._lastoper2['text'] = ''
 
     #     def toggle_grammar(self, *e):
     #         self._show_grammar = not self._show_grammar
@@ -790,14 +791,14 @@ class RecursiveDescentApp(object):
         production = self._parser.expand(self._productions[index])
 
         if production:
-            self._lastoper1["text"] = "Expand:"
-            self._lastoper2["text"] = production
-            self._prodlist.selection_clear(0, "end")
+            self._lastoper1['text'] = 'Expand:'
+            self._lastoper2['text'] = production
+            self._prodlist.selection_clear(0, 'end')
             self._prodlist.selection_set(index)
             self._animate_expand(old_frontier[0])
         else:
             # Reset the production selections.
-            self._prodlist.selection_clear(0, "end")
+            self._prodlist.selection_clear(0, 'end')
             for prod in self._parser.expandable_productions():
                 index = self._productions.index(prod)
                 self._prodlist.selection_set(index)
@@ -819,13 +820,13 @@ class RecursiveDescentApp(object):
             self._canvas,
             tree,
             node_font=self._boldfont,
-            leaf_color="white",
+            leaf_color='white',
             tree_width=2,
-            tree_color="white",
-            node_color="white",
+            tree_color='white',
+            node_color='white',
             leaf_font=self._font,
         )
-        widget.label()["color"] = "#20a050"
+        widget.label()['color'] = '#20a050'
 
         (oldx, oldy) = oldtree.label().bbox()[:2]
         (newx, newy) = widget.label().bbox()[:2]
@@ -858,7 +859,7 @@ class RecursiveDescentApp(object):
             oldtree.destroy()
 
         colors = [
-            "gray%d" % (10 * int(10 * x / self._animation_frames.get()))
+            'gray%d' % (10 * int(10 * x / self._animation_frames.get()))
             for x in range(self._animation_frames.get(), 0, -1)
         ]
 
@@ -900,22 +901,22 @@ class RecursiveDescentApp(object):
     def _animate_expand_frame(self, widget, colors):
         if len(colors) > 0:
             self._animating_lock = 1
-            widget["color"] = colors[0]
+            widget['color'] = colors[0]
             for subtree in widget.subtrees():
                 if isinstance(subtree, TreeSegmentWidget):
-                    subtree.label()["color"] = colors[0]
+                    subtree.label()['color'] = colors[0]
                 else:
-                    subtree["color"] = colors[0]
+                    subtree['color'] = colors[0]
             self._top.after(50, self._animate_expand_frame, widget, colors[1:])
         else:
-            widget["color"] = "black"
+            widget['color'] = 'black'
             for subtree in widget.subtrees():
                 if isinstance(subtree, TreeSegmentWidget):
-                    subtree.label()["color"] = "black"
+                    subtree.label()['color'] = 'black'
                 else:
-                    subtree["color"] = "black"
+                    subtree['color'] = 'black'
             self._redraw_quick()
-            widget.label()["color"] = "black"
+            widget.label()['color'] = 'black'
             self._animating_lock = 0
             if self._autostep:
                 self._step()
@@ -925,9 +926,9 @@ class RecursiveDescentApp(object):
         if self._animation_frames.get() == 0:
             colors = []
         else:
-            colors = ["#a00000", "#000000", "#a00000"]
+            colors = ['#a00000', '#000000', '#a00000']
         colors += [
-            "gray%d" % (10 * int(10 * x / (self._animation_frames.get())))
+            'gray%d' % (10 * int(10 * x / (self._animation_frames.get())))
             for x in range(1, self._animation_frames.get() + 1)
         ]
 
@@ -944,7 +945,7 @@ class RecursiveDescentApp(object):
         if len(colors) > 0:
             self._animating_lock = 1
             for widget in widgets:
-                widget["color"] = colors[0]
+                widget['color'] = colors[0]
             self._top.after(50, self._animate_backtrack_frame, widgets, colors[1:])
         else:
             for widget in widgets[0].subtrees():
@@ -977,7 +978,7 @@ class RecursiveDescentApp(object):
             widget.move(0, dy)
             self._top.after(10, self._animate_match_frame, frame - 1, widget, dy)
         else:
-            widget["color"] = "#006040"
+            widget['color'] = '#006040'
             self._redraw_quick()
             self._animating_lock = 0
             if self._autostep:
@@ -1003,14 +1004,14 @@ class RecursiveDescentApp(object):
     def set_grammar(self, grammar):
         self._parser.set_grammar(grammar)
         self._productions = list(grammar.productions())
-        self._prodlist.delete(0, "end")
+        self._prodlist.delete(0, 'end')
         for production in self._productions:
-            self._prodlist.insert("end", (" %s" % production))
+            self._prodlist.insert('end', (' %s' % production))
 
     def edit_sentence(self, *e):
         sentence = " ".join(self._sent)
-        title = "Edit Text"
-        instr = "Enter a new sentence to parse."
+        title = 'Edit Text'
+        instr = 'Enter a new sentence to parse.'
         EntryDialog(self._top, sentence, instr, self.set_sentence, title)
 
     def set_sentence(self, sentence):
@@ -1041,12 +1042,12 @@ def app():
     """
     )
 
-    sent = "the dog saw a man in the park".split()
+    sent = 'the dog saw a man in the park'.split()
 
     RecursiveDescentApp(grammar, sent).mainloop()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     app()
 
-__all__ = ["app"]
+__all__ = ['app']
index 1db15ab..1f11427 100644
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Shift-Reduce Parser Application
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -62,8 +62,8 @@ Keyboard Shortcuts::
 
 """
 
-from tkinter.font import Font
-from tkinter import IntVar, Listbox, Button, Frame, Label, Menu, Scrollbar, Tk
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import IntVar, Listbox, Button, Frame, Label, Menu, Scrollbar, Tk
 
 from nltk.tree import Tree
 from nltk.parse import SteppingShiftReduceParser
@@ -101,7 +101,7 @@ class ShiftReduceApp(object):
 
         # Set up the main window.
         self._top = Tk()
-        self._top.title("Shift Reduce Parser Application")
+        self._top.title('Shift Reduce Parser Application')
 
         # Animations.  animating_lock is a lock to prevent the demo
         # from performing new operations while it's animating.
@@ -131,7 +131,7 @@ class ShiftReduceApp(object):
 
         # Reset the demo, and set the feedback frame to empty.
         self.reset()
-        self._lastoper1["text"] = ""
+        self._lastoper1['text'] = ''
 
     #########################################
     ##  Initialization Helpers
@@ -144,171 +144,171 @@ class ShiftReduceApp(object):
 
         # What's our font size (default=same as sysfont)
         self._size = IntVar(root)
-        self._size.set(self._sysfont.cget("size"))
+        self._size.set(self._sysfont.cget('size'))
 
-        self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
-        self._font = Font(family="helvetica", size=self._size.get())
+        self._boldfont = Font(family='helvetica', weight='bold', size=self._size.get())
+        self._font = Font(family='helvetica', size=self._size.get())
 
     def _init_grammar(self, parent):
         # Grammar view.
         self._prodframe = listframe = Frame(parent)
-        self._prodframe.pack(fill="both", side="left", padx=2)
+        self._prodframe.pack(fill='both', side='left', padx=2)
         self._prodlist_label = Label(
-            self._prodframe, font=self._boldfont, text="Available Reductions"
+            self._prodframe, font=self._boldfont, text='Available Reductions'
         )
         self._prodlist_label.pack()
         self._prodlist = Listbox(
             self._prodframe,
-            selectmode="single",
-            relief="groove",
-            background="white",
-            foreground="#909090",
+            selectmode='single',
+            relief='groove',
+            background='white',
+            foreground='#909090',
             font=self._font,
-            selectforeground="#004040",
-            selectbackground="#c0f0c0",
+            selectforeground='#004040',
+            selectbackground='#c0f0c0',
         )
 
-        self._prodlist.pack(side="right", fill="both", expand=1)
+        self._prodlist.pack(side='right', fill='both', expand=1)
 
         self._productions = list(self._parser.grammar().productions())
         for production in self._productions:
-            self._prodlist.insert("end", (" %s" % production))
+            self._prodlist.insert('end', (' %s' % production))
         self._prodlist.config(height=min(len(self._productions), 25))
 
         # Add a scrollbar if there are more than 25 productions.
         if 1:  # len(self._productions) > 25:
-            listscroll = Scrollbar(self._prodframe, orient="vertical")
+            listscroll = Scrollbar(self._prodframe, orient='vertical')
             self._prodlist.config(yscrollcommand=listscroll.set)
             listscroll.config(command=self._prodlist.yview)
-            listscroll.pack(side="left", fill="y")
+            listscroll.pack(side='left', fill='y')
 
         # If they select a production, apply it.
-        self._prodlist.bind("<<ListboxSelect>>", self._prodlist_select)
+        self._prodlist.bind('<<ListboxSelect>>', self._prodlist_select)
 
         # When they hover over a production, highlight it.
         self._hover = -1
-        self._prodlist.bind("<Motion>", self._highlight_hover)
-        self._prodlist.bind("<Leave>", self._clear_hover)
+        self._prodlist.bind('<Motion>', self._highlight_hover)
+        self._prodlist.bind('<Leave>', self._clear_hover)
 
     def _init_bindings(self):
         # Quit
-        self._top.bind("<Control-q>", self.destroy)
-        self._top.bind("<Control-x>", self.destroy)
-        self._top.bind("<Alt-q>", self.destroy)
-        self._top.bind("<Alt-x>", self.destroy)
+        self._top.bind('<Control-q>', self.destroy)
+        self._top.bind('<Control-x>', self.destroy)
+        self._top.bind('<Alt-q>', self.destroy)
+        self._top.bind('<Alt-x>', self.destroy)
 
         # Ops (step, shift, reduce, undo)
-        self._top.bind("<space>", self.step)
-        self._top.bind("<s>", self.shift)
-        self._top.bind("<Alt-s>", self.shift)
-        self._top.bind("<Control-s>", self.shift)
-        self._top.bind("<r>", self.reduce)
-        self._top.bind("<Alt-r>", self.reduce)
-        self._top.bind("<Control-r>", self.reduce)
-        self._top.bind("<Delete>", self.reset)
-        self._top.bind("<u>", self.undo)
-        self._top.bind("<Alt-u>", self.undo)
-        self._top.bind("<Control-u>", self.undo)
-        self._top.bind("<Control-z>", self.undo)
-        self._top.bind("<BackSpace>", self.undo)
+        self._top.bind('<space>', self.step)
+        self._top.bind('<s>', self.shift)
+        self._top.bind('<Alt-s>', self.shift)
+        self._top.bind('<Control-s>', self.shift)
+        self._top.bind('<r>', self.reduce)
+        self._top.bind('<Alt-r>', self.reduce)
+        self._top.bind('<Control-r>', self.reduce)
+        self._top.bind('<Delete>', self.reset)
+        self._top.bind('<u>', self.undo)
+        self._top.bind('<Alt-u>', self.undo)
+        self._top.bind('<Control-u>', self.undo)
+        self._top.bind('<Control-z>', self.undo)
+        self._top.bind('<BackSpace>', self.undo)
 
         # Misc
-        self._top.bind("<Control-p>", self.postscript)
-        self._top.bind("<Control-h>", self.help)
-        self._top.bind("<F1>", self.help)
-        self._top.bind("<Control-g>", self.edit_grammar)
-        self._top.bind("<Control-t>", self.edit_sentence)
+        self._top.bind('<Control-p>', self.postscript)
+        self._top.bind('<Control-h>', self.help)
+        self._top.bind('<F1>', self.help)
+        self._top.bind('<Control-g>', self.edit_grammar)
+        self._top.bind('<Control-t>', self.edit_sentence)
 
         # Animation speed control
-        self._top.bind("-", lambda e, a=self._animate: a.set(20))
-        self._top.bind("=", lambda e, a=self._animate: a.set(10))
-        self._top.bind("+", lambda e, a=self._animate: a.set(4))
+        self._top.bind('-', lambda e, a=self._animate: a.set(20))
+        self._top.bind('=', lambda e, a=self._animate: a.set(10))
+        self._top.bind('+', lambda e, a=self._animate: a.set(4))
 
     def _init_buttons(self, parent):
         # Set up the frames.
         self._buttonframe = buttonframe = Frame(parent)
-        buttonframe.pack(fill="none", side="bottom")
+        buttonframe.pack(fill='none', side='bottom')
         Button(
             buttonframe,
-            text="Step",
-            background="#90c0d0",
-            foreground="black",
+            text='Step',
+            background='#90c0d0',
+            foreground='black',
             command=self.step,
-        ).pack(side="left")
+        ).pack(side='left')
         Button(
             buttonframe,
-            text="Shift",
+            text='Shift',
             underline=0,
-            background="#90f090",
-            foreground="black",
+            background='#90f090',
+            foreground='black',
             command=self.shift,
-        ).pack(side="left")
+        ).pack(side='left')
         Button(
             buttonframe,
-            text="Reduce",
+            text='Reduce',
             underline=0,
-            background="#90f090",
-            foreground="black",
+            background='#90f090',
+            foreground='black',
             command=self.reduce,
-        ).pack(side="left")
+        ).pack(side='left')
         Button(
             buttonframe,
-            text="Undo",
+            text='Undo',
             underline=0,
-            background="#f0a0a0",
-            foreground="black",
+            background='#f0a0a0',
+            foreground='black',
             command=self.undo,
-        ).pack(side="left")
+        ).pack(side='left')
 
     def _init_menubar(self, parent):
         menubar = Menu(parent)
 
         filemenu = Menu(menubar, tearoff=0)
         filemenu.add_command(
-            label="Reset Parser", underline=0, command=self.reset, accelerator="Del"
+            label='Reset Parser', underline=0, command=self.reset, accelerator='Del'
         )
         filemenu.add_command(
-            label="Print to Postscript",
+            label='Print to Postscript',
             underline=0,
             command=self.postscript,
-            accelerator="Ctrl-p",
+            accelerator='Ctrl-p',
         )
         filemenu.add_command(
-            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+            label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
         )
-        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
 
         editmenu = Menu(menubar, tearoff=0)
         editmenu.add_command(
-            label="Edit Grammar",
+            label='Edit Grammar',
             underline=5,
             command=self.edit_grammar,
-            accelerator="Ctrl-g",
+            accelerator='Ctrl-g',
         )
         editmenu.add_command(
-            label="Edit Text",
+            label='Edit Text',
             underline=5,
             command=self.edit_sentence,
-            accelerator="Ctrl-t",
+            accelerator='Ctrl-t',
         )
-        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
+        menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
 
         rulemenu = Menu(menubar, tearoff=0)
         rulemenu.add_command(
-            label="Step", underline=1, command=self.step, accelerator="Space"
+            label='Step', underline=1, command=self.step, accelerator='Space'
         )
         rulemenu.add_separator()
         rulemenu.add_command(
-            label="Shift", underline=0, command=self.shift, accelerator="Ctrl-s"
+            label='Shift', underline=0, command=self.shift, accelerator='Ctrl-s'
         )
         rulemenu.add_command(
-            label="Reduce", underline=0, command=self.reduce, accelerator="Ctrl-r"
+            label='Reduce', underline=0, command=self.reduce, accelerator='Ctrl-r'
         )
         rulemenu.add_separator()
         rulemenu.add_command(
-            label="Undo", underline=0, command=self.undo, accelerator="Ctrl-u"
+            label='Undo', underline=0, command=self.undo, accelerator='Ctrl-u'
         )
-        menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)
+        menubar.add_cascade(label='Apply', underline=0, menu=rulemenu)
 
         viewmenu = Menu(menubar, tearoff=0)
         viewmenu.add_checkbutton(
@@ -319,41 +319,41 @@ class ShiftReduceApp(object):
         )
         viewmenu.add_separator()
         viewmenu.add_radiobutton(
-            label="Tiny",
+            label='Tiny',
             variable=self._size,
             underline=0,
             value=10,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Small",
+            label='Small',
             variable=self._size,
             underline=0,
             value=12,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Medium",
+            label='Medium',
             variable=self._size,
             underline=0,
             value=14,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Large",
+            label='Large',
             variable=self._size,
             underline=0,
             value=18,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Huge",
+            label='Huge',
             variable=self._size,
             underline=0,
             value=24,
             command=self.resize,
         )
-        menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+        menubar.add_cascade(label='View', underline=0, menu=viewmenu)
 
         animatemenu = Menu(menubar, tearoff=0)
         animatemenu.add_radiobutton(
@@ -364,81 +364,81 @@ class ShiftReduceApp(object):
             underline=0,
             variable=self._animate,
             value=20,
-            accelerator="-",
+            accelerator='-',
         )
         animatemenu.add_radiobutton(
             label="Normal Animation",
             underline=0,
             variable=self._animate,
             value=10,
-            accelerator="=",
+            accelerator='=',
         )
         animatemenu.add_radiobutton(
             label="Fast Animation",
             underline=0,
             variable=self._animate,
             value=4,
-            accelerator="+",
+            accelerator='+',
         )
         menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
 
         helpmenu = Menu(menubar, tearoff=0)
-        helpmenu.add_command(label="About", underline=0, command=self.about)
+        helpmenu.add_command(label='About', underline=0, command=self.about)
         helpmenu.add_command(
-            label="Instructions", underline=0, command=self.help, accelerator="F1"
+            label='Instructions', underline=0, command=self.help, accelerator='F1'
         )
-        menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+        menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
 
         parent.config(menu=menubar)
 
     def _init_feedback(self, parent):
         self._feedbackframe = feedbackframe = Frame(parent)
-        feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3)
+        feedbackframe.pack(fill='x', side='bottom', padx=3, pady=3)
         self._lastoper_label = Label(
-            feedbackframe, text="Last Operation:", font=self._font
+            feedbackframe, text='Last Operation:', font=self._font
         )
-        self._lastoper_label.pack(side="left")
-        lastoperframe = Frame(feedbackframe, relief="sunken", border=1)
-        lastoperframe.pack(fill="x", side="right", expand=1, padx=5)
+        self._lastoper_label.pack(side='left')
+        lastoperframe = Frame(feedbackframe, relief='sunken', border=1)
+        lastoperframe.pack(fill='x', side='right', expand=1, padx=5)
         self._lastoper1 = Label(
-            lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font
+            lastoperframe, foreground='#007070', background='#f0f0f0', font=self._font
         )
         self._lastoper2 = Label(
             lastoperframe,
-            anchor="w",
+            anchor='w',
             width=30,
-            foreground="#004040",
-            background="#f0f0f0",
+            foreground='#004040',
+            background='#f0f0f0',
             font=self._font,
         )
-        self._lastoper1.pack(side="left")
-        self._lastoper2.pack(side="left", fill="x", expand=1)
+        self._lastoper1.pack(side='left')
+        self._lastoper2.pack(side='left', fill='x', expand=1)
 
     def _init_canvas(self, parent):
         self._cframe = CanvasFrame(
             parent,
-            background="white",
+            background='white',
             width=525,
             closeenough=10,
             border=2,
-            relief="sunken",
+            relief='sunken',
         )
-        self._cframe.pack(expand=1, fill="both", side="top", pady=2)
+        self._cframe.pack(expand=1, fill='both', side='top', pady=2)
         canvas = self._canvas = self._cframe.canvas()
 
         self._stackwidgets = []
         self._rtextwidgets = []
         self._titlebar = canvas.create_rectangle(
-            0, 0, 0, 0, fill="#c0f0f0", outline="black"
+            0, 0, 0, 0, fill='#c0f0f0', outline='black'
         )
-        self._exprline = canvas.create_line(0, 0, 0, 0, dash=".")
-        self._stacktop = canvas.create_line(0, 0, 0, 0, fill="#408080")
+        self._exprline = canvas.create_line(0, 0, 0, 0, dash='.')
+        self._stacktop = canvas.create_line(0, 0, 0, 0, fill='#408080')
         size = self._size.get() + 4
         self._stacklabel = TextWidget(
-            canvas, "Stack", color="#004040", font=self._boldfont
+            canvas, 'Stack', color='#004040', font=self._boldfont
         )
         self._rtextlabel = TextWidget(
-            canvas, "Remaining Text", color="#004040", font=self._boldfont
+            canvas, 'Remaining Text', color='#004040', font=self._boldfont
         )
         self._cframe.add_widget(self._stacklabel)
         self._cframe.add_widget(self._rtextlabel)
@@ -448,7 +448,7 @@ class ShiftReduceApp(object):
     #########################################
 
     def _redraw(self):
-        scrollregion = self._canvas["scrollregion"].split()
+        scrollregion = self._canvas['scrollregion'].split()
         (cx1, cy1, cx2, cy2) = [int(c) for c in scrollregion]
 
         # Delete the old stack & rtext widgets.
@@ -476,17 +476,17 @@ class ShiftReduceApp(object):
         for tok in self._parser.stack():
             if isinstance(tok, Tree):
                 attribs = {
-                    "tree_color": "#4080a0",
-                    "tree_width": 2,
-                    "node_font": self._boldfont,
-                    "node_color": "#006060",
-                    "leaf_color": "#006060",
-                    "leaf_font": self._font,
+                    'tree_color': '#4080a0',
+                    'tree_width': 2,
+                    'node_font': self._boldfont,
+                    'node_color': '#006060',
+                    'leaf_color': '#006060',
+                    'leaf_font': self._font,
                 }
                 widget = tree_to_treesegment(self._canvas, tok, **attribs)
-                widget.label()["color"] = "#000000"
+                widget.label()['color'] = '#000000'
             else:
-                widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
+                widget = TextWidget(self._canvas, tok, color='#000000', font=self._font)
             widget.bind_click(self._popup_reduce)
             self._stackwidgets.append(widget)
             self._cframe.add_widget(widget, stackx, y)
@@ -495,7 +495,7 @@ class ShiftReduceApp(object):
         # Draw the remaining text.
         rtextwidth = 0
         for tok in self._parser.remaining_text():
-            widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
+            widget = TextWidget(self._canvas, tok, color='#000000', font=self._font)
             self._rtextwidgets.append(widget)
             self._cframe.add_widget(widget, rtextwidth, y)
             rtextwidth = widget.bbox()[2] + 4
@@ -541,7 +541,7 @@ class ShiftReduceApp(object):
 
     def _highlight_productions(self):
         # Highlight the productions that can be reduced.
-        self._prodlist.selection_clear(0, "end")
+        self._prodlist.selection_clear(0, 'end')
         for prod in self._parser.reducible_productions():
             index = self._productions.index(prod)
             self._prodlist.selection_set(index)
@@ -558,8 +558,8 @@ class ShiftReduceApp(object):
 
     def reset(self, *e):
         self._parser.initialize(self._sent)
-        self._lastoper1["text"] = "Reset App"
-        self._lastoper2["text"] = ""
+        self._lastoper1['text'] = 'Reset App'
+        self._lastoper2['text'] = ''
         self._redraw()
 
     def step(self, *e):
@@ -569,19 +569,19 @@ class ShiftReduceApp(object):
             return True
         else:
             if list(self._parser.parses()):
-                self._lastoper1["text"] = "Finished:"
-                self._lastoper2["text"] = "Success"
+                self._lastoper1['text'] = 'Finished:'
+                self._lastoper2['text'] = 'Success'
             else:
-                self._lastoper1["text"] = "Finished:"
-                self._lastoper2["text"] = "Failure"
+                self._lastoper1['text'] = 'Finished:'
+                self._lastoper2['text'] = 'Failure'
 
     def shift(self, *e):
         if self._animating_lock:
             return
         if self._parser.shift():
             tok = self._parser.stack()[-1]
-            self._lastoper1["text"] = "Shift:"
-            self._lastoper2["text"] = "%r" % tok
+            self._lastoper1['text'] = 'Shift:'
+            self._lastoper2['text'] = '%r' % tok
             if self._animate.get():
                 self._animate_shift()
             else:
@@ -594,8 +594,8 @@ class ShiftReduceApp(object):
             return
         production = self._parser.reduce()
         if production:
-            self._lastoper1["text"] = "Reduce:"
-            self._lastoper2["text"] = "%s" % production
+            self._lastoper1['text'] = 'Reduce:'
+            self._lastoper2['text'] = '%s' % production
             if self._animate.get():
                 self._animate_reduce()
             else:
@@ -648,24 +648,24 @@ class ShiftReduceApp(object):
         try:
             ShowText(
                 self._top,
-                "Help: Shift-Reduce Parser Application",
-                (__doc__ or "").strip(),
+                'Help: Shift-Reduce Parser Application',
+                (__doc__ or '').strip(),
                 width=75,
-                font="fixed",
+                font='fixed',
             )
         except:
             ShowText(
                 self._top,
-                "Help: Shift-Reduce Parser Application",
-                (__doc__ or "").strip(),
+                'Help: Shift-Reduce Parser Application',
+                (__doc__ or '').strip(),
                 width=75,
             )
 
     def about(self, *e):
         ABOUT = "NLTK Shift-Reduce Parser Application\n" + "Written by Edward Loper"
-        TITLE = "About: Shift-Reduce Parser Application"
+        TITLE = 'About: Shift-Reduce Parser Application'
         try:
-            from tkinter.messagebox import Message
+            from six.moves.tkinter_messagebox import Message
 
             Message(message=ABOUT, title=TITLE).show()
         except:
@@ -677,14 +677,14 @@ class ShiftReduceApp(object):
     def set_grammar(self, grammar):
         self._parser.set_grammar(grammar)
         self._productions = list(grammar.productions())
-        self._prodlist.delete(0, "end")
+        self._prodlist.delete(0, 'end')
         for production in self._productions:
-            self._prodlist.insert("end", (" %s" % production))
+            self._prodlist.insert('end', (' %s' % production))
 
     def edit_sentence(self, *e):
         sentence = " ".join(self._sent)
-        title = "Edit Text"
-        instr = "Enter a new sentence to parse."
+        title = 'Edit Text'
+        instr = 'Enter a new sentence to parse.'
         EntryDialog(self._top, sentence, instr, self.set_sentence, title)
 
     def set_sentence(self, sent):
@@ -698,13 +698,13 @@ class ShiftReduceApp(object):
     def _toggle_grammar(self, *e):
         if self._show_grammar.get():
             self._prodframe.pack(
-                fill="both", side="left", padx=2, after=self._feedbackframe
+                fill='both', side='left', padx=2, after=self._feedbackframe
             )
-            self._lastoper1["text"] = "Show Grammar"
+            self._lastoper1['text'] = 'Show Grammar'
         else:
             self._prodframe.pack_forget()
-            self._lastoper1["text"] = "Hide Grammar"
-        self._lastoper2["text"] = ""
+            self._lastoper1['text'] = 'Hide Grammar'
+        self._lastoper2['text'] = ''
 
     def _prodlist_select(self, event):
         selection = self._prodlist.curselection()
@@ -713,15 +713,15 @@ class ShiftReduceApp(object):
         index = int(selection[0])
         production = self._parser.reduce(self._productions[index])
         if production:
-            self._lastoper1["text"] = "Reduce:"
-            self._lastoper2["text"] = "%s" % production
+            self._lastoper1['text'] = 'Reduce:'
+            self._lastoper2['text'] = '%s' % production
             if self._animate.get():
                 self._animate_reduce()
             else:
                 self._redraw()
         else:
             # Reset the production selections.
-            self._prodlist.selection_clear(0, "end")
+            self._prodlist.selection_clear(0, 'end')
             for prod in self._parser.reducible_productions():
                 index = self._productions.index(prod)
                 self._prodlist.selection_set(index)
@@ -732,7 +732,7 @@ class ShiftReduceApp(object):
         if len(productions) == 0:
             return
 
-        self._reduce_menu.delete(0, "end")
+        self._reduce_menu.delete(0, 'end')
         for production in productions:
             self._reduce_menu.add_command(label=str(production), command=self.reduce)
         self._reduce_menu.post(
@@ -806,7 +806,7 @@ class ShiftReduceApp(object):
             if not isinstance(tok, Tree):
                 raise ValueError()
             label = TextWidget(
-                self._canvas, str(tok.label()), color="#006060", font=self._boldfont
+                self._canvas, str(tok.label()), color='#006060', font=self._boldfont
             )
             widget = TreeSegmentWidget(self._canvas, label, widgets, width=2)
             (x1, y1, x2, y2) = self._stacklabel.bbox()
@@ -870,9 +870,9 @@ class ShiftReduceApp(object):
             rhslen = len(self._productions[index].rhs())
             for stackwidget in self._stackwidgets[-rhslen:]:
                 if isinstance(stackwidget, TreeSegmentWidget):
-                    stackwidget.label()["color"] = "#00a000"
+                    stackwidget.label()['color'] = '#00a000'
                 else:
-                    stackwidget["color"] = "#00a000"
+                    stackwidget['color'] = '#00a000'
 
         # Remember what production we're hovering over.
         self._hover = index
@@ -884,9 +884,9 @@ class ShiftReduceApp(object):
         self._hover = -1
         for stackwidget in self._stackwidgets:
             if isinstance(stackwidget, TreeSegmentWidget):
-                stackwidget.label()["color"] = "black"
+                stackwidget.label()['color'] = 'black'
             else:
-                stackwidget["color"] = "black"
+                stackwidget['color'] = 'black'
 
 
 def app():
@@ -897,7 +897,7 @@ def app():
 
     from nltk.grammar import Nonterminal, Production, CFG
 
-    nonterminals = "S VP NP PP P N Name V Det"
+    nonterminals = 'S VP NP PP P N Name V Det'
     (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()]
 
     productions = (
@@ -910,28 +910,28 @@ def app():
         Production(VP, [V, NP]),
         Production(PP, [P, NP]),
         # Lexical Productions
-        Production(NP, ["I"]),
-        Production(Det, ["the"]),
-        Production(Det, ["a"]),
-        Production(N, ["man"]),
-        Production(V, ["saw"]),
-        Production(P, ["in"]),
-        Production(P, ["with"]),
-        Production(N, ["park"]),
-        Production(N, ["dog"]),
-        Production(N, ["statue"]),
-        Production(Det, ["my"]),
+        Production(NP, ['I']),
+        Production(Det, ['the']),
+        Production(Det, ['a']),
+        Production(N, ['man']),
+        Production(V, ['saw']),
+        Production(P, ['in']),
+        Production(P, ['with']),
+        Production(N, ['park']),
+        Production(N, ['dog']),
+        Production(N, ['statue']),
+        Production(Det, ['my']),
     )
 
     grammar = CFG(S, productions)
 
     # tokenize the sentence
-    sent = "my dog saw a man in the park with a statue".split()
+    sent = 'my dog saw a man in the park with a statue'.split()
 
     ShiftReduceApp(grammar, sent).mainloop()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     app()
 
-__all__ = ["app"]
+__all__ = ['app']
diff --git a/nlp_resource_data/nltk/app/wordfreq_app.py b/nlp_resource_data/nltk/app/wordfreq_app.py
index 522139c..52c7c66 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Wordfreq Application
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -25,11 +25,11 @@ def plot_word_freq_dist(text):
 
 
 def app():
-    t1 = Text(gutenberg.words("melville-moby_dick.txt"))
+    t1 = Text(gutenberg.words('melville-moby_dick.txt'))
     plot_word_freq_dist(t1)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     app()
 
-__all__ = ["app"]
+__all__ = ['app']
diff --git a/nlp_resource_data/nltk/app/wordnet_app.py b/nlp_resource_data/nltk/app/wordnet_app.py
index f31000a..9854955 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: WordNet Browser Application
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
 #         Paul Bone <pbone@students.csse.unimelb.edu.au>
 # URL: <http://nltk.org/>
@@ -44,6 +44,7 @@ Options::
 # modifying to be compliant with NLTK's coding standards.  Tests also
 # need to be develop to ensure this continues to work in the face of
 # changes to other NLTK packages.
+from __future__ import print_function
 
 # Allow this program to run inside the NLTK source tree.
 from sys import path
@@ -61,12 +62,18 @@ import getopt
 import base64
 import pickle
 import copy
-from http.server import HTTPServer, BaseHTTPRequestHandler
-from urllib.parse import unquote_plus
 
+from six.moves.urllib.parse import unquote_plus
+
+from nltk import compat
 from nltk.corpus import wordnet as wn
 from nltk.corpus.reader.wordnet import Synset, Lemma
 
+if compat.PY3:
+    from http.server import HTTPServer, BaseHTTPRequestHandler
+else:
+    from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
+
 # now included in local file
 # from util import html_header, html_trailer, \
 #    get_static_index_page, get_static_page_by_path, \
@@ -89,40 +96,40 @@ class MyServerHandler(BaseHTTPRequestHandler):
     def do_GET(self):
         global firstClient
         sp = self.path[1:]
-        if unquote_plus(sp) == "SHUTDOWN THE SERVER":
+        if unquote_plus(sp) == 'SHUTDOWN THE SERVER':
             if server_mode:
                 page = "Server must be killed with SIGTERM."
                 type = "text/plain"
             else:
-                print("Server shutting down!")
+                print('Server shutting down!')
                 os._exit(0)
 
-        elif sp == "":  # First request.
-            type = "text/html"
+        elif sp == '':  # First request.
+            type = 'text/html'
             if not server_mode and firstClient:
                 firstClient = False
                 page = get_static_index_page(True)
             else:
                 page = get_static_index_page(False)
-            word = "green"
+            word = 'green'
 
-        elif sp.endswith(".html"):  # Trying to fetch a HTML file TODO:
-            type = "text/html"
+        elif sp.endswith('.html'):  # Trying to fetch a HTML file TODO:
+            type = 'text/html'
             usp = unquote_plus(sp)
-            if usp == "NLTK Wordnet Browser Database Info.html":
-                word = "* Database Info *"
+            if usp == 'NLTK Wordnet Browser Database Info.html':
+                word = '* Database Info *'
                 if os.path.isfile(usp):
-                    with open(usp, "r") as infile:
+                    with open(usp, 'r') as infile:
                         page = infile.read()
                 else:
                     page = (
-                        (html_header % word) + "<p>The database info file:"
-                        "<p><b>"
+                        (html_header % word) + '<p>The database info file:'
+                        '<p><b>'
                         + usp
-                        + "</b>"
-                        + "<p>was not found. Run this:"
-                        + "<p><b>python dbinfo_html.py</b>"
-                        + "<p>to produce it."
+                        + '</b>'
+                        + '<p>was not found. Run this:'
+                        + '<p><b>python dbinfo_html.py</b>'
+                        + '<p>to produce it.'
                         + html_trailer
                     )
             else:
@@ -131,7 +138,7 @@ class MyServerHandler(BaseHTTPRequestHandler):
                 page = get_static_page_by_path(usp)
         elif sp.startswith("search"):
             # This doesn't seem to work with MWEs.
-            type = "text/html"
+            type = 'text/html'
             parts = (sp.split("?")[1]).split("&")
             word = [
                 p.split("=")[1].replace("+", " ")
@@ -141,25 +148,25 @@ class MyServerHandler(BaseHTTPRequestHandler):
             page, word = page_from_word(word)
         elif sp.startswith("lookup_"):
             # TODO add a variation of this that takes a non ecoded word or MWE.
-            type = "text/html"
+            type = 'text/html'
             sp = sp[len("lookup_") :]
             page, word = page_from_href(sp)
         elif sp == "start_page":
             # if this is the first request we should display help
             # information, and possibly set a default word.
-            type = "text/html"
+            type = 'text/html'
             page, word = page_from_word("wordnet")
         else:
-            type = "text/plain"
+            type = 'text/plain'
             page = "Could not parse request: '%s'" % sp
 
         # Send result.
         self.send_head(type)
-        self.wfile.write(page.encode("utf8"))
+        self.wfile.write(page.encode('utf8'))
 
     def send_head(self, type=None):
         self.send_response(200)
-        self.send_header("Content-type", type)
+        self.send_header('Content-type', type)
         self.end_headers()
 
     def log_message(self, format, *args):
@@ -177,7 +184,7 @@ def get_unique_counter_from_url(sp):
     Extract the unique counter from the URL if it has one.  Otherwise return
     null.
     """
-    pos = sp.rfind("%23")
+    pos = sp.rfind('%23')
     if pos != -1:
         return int(sp[(pos + 3) :])
     else:
@@ -227,7 +234,7 @@ def wnb(port=8000, runBrowser=True, logfilename=None):
         logfile = None
 
     # Compute URL and start web browser
-    url = "http://localhost:" + str(port)
+    url = 'http://localhost:' + str(port)
 
     server_ready = None
     browser_thread = None
@@ -237,9 +244,9 @@ def wnb(port=8000, runBrowser=True, logfilename=None):
         browser_thread = startBrowser(url, server_ready)
 
     # Start the server.
-    server = HTTPServer(("", port), MyServerHandler)
+    server = HTTPServer(('', port), MyServerHandler)
     if logfile:
-        logfile.write("NLTK Wordnet browser server running serving: %s\n" % url)
+        logfile.write('NLTK Wordnet browser server running serving: %s\n' % url)
     if runBrowser:
         server_ready.set()
 
@@ -287,10 +294,10 @@ This provides a backend to both wxbrowse and browserver.py.
 # WordNet corpus is installed.
 def _pos_tuples():
     return [
-        (wn.NOUN, "N", "noun"),
-        (wn.VERB, "V", "verb"),
-        (wn.ADJ, "J", "adj"),
-        (wn.ADV, "R", "adv"),
+        (wn.NOUN, 'N', 'noun'),
+        (wn.VERB, 'V', 'verb'),
+        (wn.ADJ, 'J', 'adj'),
+        (wn.ADV, 'R', 'adv'),
     ]
 
 
@@ -300,8 +307,8 @@ def _pos_match(pos_tuple):
     tuple given to it.  It attempts to match it against the first
     non-null component of the given pos tuple.
     """
-    if pos_tuple[0] == "s":
-        pos_tuple = ("a", pos_tuple[1], pos_tuple[2])
+    if pos_tuple[0] == 's':
+        pos_tuple = ('a', pos_tuple[1], pos_tuple[2])
     for n, x in enumerate(pos_tuple):
         if x is not None:
             break
@@ -366,24 +373,24 @@ def get_relations_data(word, synset):
     """
     if synset.pos() == wn.NOUN:
         return (
-            (HYPONYM, "Hyponyms", synset.hyponyms()),
-            (INSTANCE_HYPONYM, "Instance hyponyms", synset.instance_hyponyms()),
-            (HYPERNYM, "Direct hypernyms", synset.hypernyms()),
+            (HYPONYM, 'Hyponyms', synset.hyponyms()),
+            (INSTANCE_HYPONYM, 'Instance hyponyms', synset.instance_hyponyms()),
+            (HYPERNYM, 'Direct hypernyms', synset.hypernyms()),
             (
                 INDIRECT_HYPERNYMS,
-                "Indirect hypernyms",
+                'Indirect hypernyms',
                 rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1],
             ),
             #  hypernyms', 'Sister terms',
-            (INSTANCE_HYPERNYM, "Instance hypernyms", synset.instance_hypernyms()),
+            (INSTANCE_HYPERNYM, 'Instance hypernyms', synset.instance_hypernyms()),
             #            (CLASS_REGIONAL, ['domain term region'], ),
-            (PART_HOLONYM, "Part holonyms", synset.part_holonyms()),
-            (PART_MERONYM, "Part meronyms", synset.part_meronyms()),
-            (SUBSTANCE_HOLONYM, "Substance holonyms", synset.substance_holonyms()),
-            (SUBSTANCE_MERONYM, "Substance meronyms", synset.substance_meronyms()),
-            (MEMBER_HOLONYM, "Member holonyms", synset.member_holonyms()),
-            (MEMBER_MERONYM, "Member meronyms", synset.member_meronyms()),
-            (ATTRIBUTE, "Attributes", synset.attributes()),
+            (PART_HOLONYM, 'Part holonyms', synset.part_holonyms()),
+            (PART_MERONYM, 'Part meronyms', synset.part_meronyms()),
+            (SUBSTANCE_HOLONYM, 'Substance holonyms', synset.substance_holonyms()),
+            (SUBSTANCE_MERONYM, 'Substance meronyms', synset.substance_meronyms()),
+            (MEMBER_HOLONYM, 'Member holonyms', synset.member_holonyms()),
+            (MEMBER_MERONYM, 'Member meronyms', synset.member_meronyms()),
+            (ATTRIBUTE, 'Attributes', synset.attributes()),
             (ANTONYM, "Antonyms", lemma_property(word, synset, lambda l: l.antonyms())),
             (
                 DERIVATIONALLY_RELATED_FORM,
@@ -395,18 +402,18 @@ def get_relations_data(word, synset):
         )
     elif synset.pos() == wn.VERB:
         return (
-            (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
-            (HYPONYM, "Hyponym", synset.hyponyms()),
-            (HYPERNYM, "Direct hypernyms", synset.hypernyms()),
+            (ANTONYM, 'Antonym', lemma_property(word, synset, lambda l: l.antonyms())),
+            (HYPONYM, 'Hyponym', synset.hyponyms()),
+            (HYPERNYM, 'Direct hypernyms', synset.hypernyms()),
             (
                 INDIRECT_HYPERNYMS,
-                "Indirect hypernyms",
+                'Indirect hypernyms',
                 rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1],
             ),
-            (ENTAILMENT, "Entailments", synset.entailments()),
-            (CAUSE, "Causes", synset.causes()),
-            (ALSO_SEE, "Also see", synset.also_sees()),
-            (VERB_GROUP, "Verb Groups", synset.verb_groups()),
+            (ENTAILMENT, 'Entailments', synset.entailments()),
+            (CAUSE, 'Causes', synset.causes()),
+            (ALSO_SEE, 'Also see', synset.also_sees()),
+            (VERB_GROUP, 'Verb Groups', synset.verb_groups()),
             (
                 DERIVATIONALLY_RELATED_FORM,
                 "Derivationally related form",
@@ -417,29 +424,29 @@ def get_relations_data(word, synset):
         )
     elif synset.pos() == wn.ADJ or synset.pos == wn.ADJ_SAT:
         return (
-            (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
-            (SIMILAR, "Similar to", synset.similar_tos()),
+            (ANTONYM, 'Antonym', lemma_property(word, synset, lambda l: l.antonyms())),
+            (SIMILAR, 'Similar to', synset.similar_tos()),
             # Participle of verb - not supported by corpus
             (
                 PERTAINYM,
-                "Pertainyms",
+                'Pertainyms',
                 lemma_property(word, synset, lambda l: l.pertainyms()),
             ),
-            (ATTRIBUTE, "Attributes", synset.attributes()),
-            (ALSO_SEE, "Also see", synset.also_sees()),
+            (ATTRIBUTE, 'Attributes', synset.attributes()),
+            (ALSO_SEE, 'Also see', synset.also_sees()),
         )
     elif synset.pos() == wn.ADV:
         # This is weird. adverbs such as 'quick' and 'fast' don't seem
         # to have antonyms returned by the corpus.a
         return (
-            (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
+            (ANTONYM, 'Antonym', lemma_property(word, synset, lambda l: l.antonyms())),
         )
         # Derived from adjective - not supported by corpus
     else:
         raise TypeError("Unhandles synset POS type: " + str(synset.pos()))
 
 
-html_header = """
+html_header = '''
 <!DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
 'http://www.w3.org/TR/html4/strict.dtd'>
 <html>
@@ -450,13 +457,13 @@ html_header = """
 'text/html; charset=us-ascii'>
 <title>NLTK Wordnet Browser display of: %s</title></head>
 <body bgcolor='#F5F5F5' text='#000000'>
-"""
-html_trailer = """
+'''
+html_trailer = '''
 </body>
 </html>
-"""
+'''
 
-explanation = """
+explanation = '''
 <h3>Search Help</h3>
 <ul><li>The display below the line is an example of the output the browser
 shows you when you enter a search word. The search word was <b>green</b>.</li>
@@ -475,33 +482,33 @@ synsets.</li>
 <b>Enter/Return</b> key or click the <b>Search</b> button.</li>
 </ul>
 <hr width='100%'>
-"""
+'''
 
 # HTML oriented functions
 
 
 def _bold(txt):
-    return "<b>%s</b>" % txt
+    return '<b>%s</b>' % txt
 
 
 def _center(txt):
-    return "<center>%s</center>" % txt
+    return '<center>%s</center>' % txt
 
 
 def _hlev(n, txt):
-    return "<h%d>%s</h%d>" % (n, txt, n)
+    return '<h%d>%s</h%d>' % (n, txt, n)
 
 
 def _italic(txt):
-    return "<i>%s</i>" % txt
+    return '<i>%s</i>' % txt
 
 
 def _li(txt):
-    return "<li>%s</li>" % txt
+    return '<li>%s</li>' % txt
 
 
 def pg(word, body):
-    """
+    '''
     Return a HTML page of NLTK Browser format constructed from the
     word and body
 
@@ -511,22 +518,22 @@ def pg(word, body):
     :type body: str
     :return: a HTML page for the word-body combination
     :rtype: str
-    """
+    '''
     return (html_header % word) + body + html_trailer
 
 
 def _ul(txt):
-    return "<ul>" + txt + "</ul>"
+    return '<ul>' + txt + '</ul>'
 
 
 def _abbc(txt):
     """
     abbc = asterisks, breaks, bold, center
     """
-    return _center(_bold("<br>" * 10 + "*" * 10 + " " + txt + " " + "*" * 10))
+    return _center(_bold('<br>' * 10 + '*' * 10 + ' ' + txt + ' ' + '*' * 10))
 
 
-full_hyponym_cont_text = _ul(_li(_italic("(has full hyponym continuation)"))) + "\n"
+full_hyponym_cont_text = _ul(_li(_italic('(has full hyponym continuation)'))) + '\n'
 
 
 def _get_synset(synset_key):
@@ -538,7 +545,7 @@ def _get_synset(synset_key):
 
 
 def _collect_one_synset(word, synset, synset_relations):
-    """
+    '''
     Returns the HTML string for one synset or word
 
     :param word: the current word
@@ -550,11 +557,11 @@ def _collect_one_synset(word, synset, synset_relations):
     :type synset_relations: dict(synset_key, set(relation_id))
     :return: The HTML string built for this synset
     :rtype: str
-    """
+    '''
     if isinstance(synset, tuple):  # It's a word
         raise NotImplementedError("word not supported by _collect_one_synset")
 
-    typ = "S"
+    typ = 'S'
     pos_tuple = _pos_match((synset.pos(), None, None))
     assert pos_tuple is not None, "pos_tuple is null: synset.pos(): %s" % synset.pos()
     descr = pos_tuple[2]
@@ -563,23 +570,23 @@ def _collect_one_synset(word, synset, synset_relations):
     synset_label = typ + ";"
     if synset.name() in synset_relations:
         synset_label = _bold(synset_label)
-    s = "<li>%s (%s) " % (make_lookup_link(ref, synset_label), descr)
+    s = '<li>%s (%s) ' % (make_lookup_link(ref, synset_label), descr)
 
     def format_lemma(w):
-        w = w.replace("_", " ")
+        w = w.replace('_', ' ')
         if w.lower() == word:
             return _bold(w)
         else:
             ref = Reference(w)
             return make_lookup_link(ref, w)
 
-    s += ", ".join(format_lemma(l.name()) for l in synset.lemmas())
+    s += ', '.join(format_lemma(l.name()) for l in synset.lemmas())
 
     gl = " (%s) <i>%s</i> " % (
         synset.definition(),
-        "; ".join('"%s"' % e for e in synset.examples()),
+        "; ".join("\"%s\"" % e for e in synset.examples()),
     )
-    return s + gl + _synset_relations(word, synset, synset_relations) + "</li>\n"
+    return s + gl + _synset_relations(word, synset, synset_relations) + '</li>\n'
 
 
 def _collect_all_synsets(word, pos, synset_relations=dict()):
@@ -587,7 +594,7 @@ def _collect_all_synsets(word, pos, synset_relations=dict()):
     Return a HTML unordered list of synsets for the given word and
     part of speech.
     """
-    return "<ul>%s\n</ul>\n" % "".join(
+    return '<ul>%s\n</ul>\n' % ''.join(
         (
             _collect_one_synset(word, synset, synset_relations)
             for synset in wn.synsets(word, pos)
@@ -596,7 +603,7 @@ def _collect_all_synsets(word, pos, synset_relations=dict()):
 
 
 def _synset_relations(word, synset, synset_relations):
-    """
+    '''
     Builds the HTML string for the relations of a synset
 
     :param word: The current word
@@ -607,7 +614,7 @@ def _synset_relations(word, synset, synset_relations):
     :type synset_relations: dict(synset_key, set(relation_type))
     :return: The HTML for a synset's relations
     :rtype: str
-    """
+    '''
 
     if not synset.name() in synset_relations:
         return ""
@@ -623,7 +630,7 @@ def _synset_relations(word, synset, synset_relations):
             # similar tuples.  This forms a tree of synsets.
             return "%s\n<ul>%s</ul>\n" % (
                 relation_html(r[0]),
-                "".join("<li>%s</li>\n" % relation_html(sr) for sr in r[1]),
+                ''.join('<li>%s</li>\n' % relation_html(sr) for sr in r[1]),
             )
         else:
             raise TypeError(
@@ -632,28 +639,28 @@ def _synset_relations(word, synset, synset_relations):
             )
 
     def make_synset_html(db_name, disp_name, rels):
-        synset_html = "<i>%s</i>\n" % make_lookup_link(
+        synset_html = '<i>%s</i>\n' % make_lookup_link(
             copy.deepcopy(ref).toggle_synset_relation(synset, db_name).encode(),
             disp_name,
         )
 
         if db_name in ref.synset_relations[synset.name()]:
-            synset_html += "<ul>%s</ul>\n" % "".join(
+            synset_html += '<ul>%s</ul>\n' % ''.join(
                 "<li>%s</li>\n" % relation_html(r) for r in rels
             )
 
         return synset_html
 
     html = (
-        "<ul>"
-        + "\n".join(
+        '<ul>'
+        + '\n'.join(
             (
                 "<li>%s</li>" % make_synset_html(*rel_data)
                 for rel_data in get_relations_data(word, synset)
                 if rel_data[2] != []
             )
         )
-        + "</ul>"
+        + '</ul>'
     )
 
     return html
@@ -743,7 +750,7 @@ def page_from_word(word):
 
 
 def page_from_href(href):
-    """
+    '''
     Returns a tuple of the HTML page built and the new current word
 
     :param href: The hypertext reference to be solved
@@ -752,12 +759,12 @@ def page_from_href(href):
              to be sent to the browser and
              word is the new current word
     :rtype: A tuple (str,str)
-    """
+    '''
     return page_from_reference(Reference.decode(href))
 
 
 def page_from_reference(href):
-    """
+    '''
     Returns a tuple of the HTML page built and the new current word
 
     :param href: The hypertext reference to be solved
@@ -766,11 +773,11 @@ def page_from_reference(href):
              to be sent to the browser and
              word is the new current word
     :rtype: A tuple (str,str)
-    """
+    '''
     word = href.word
     pos_forms = defaultdict(list)
-    words = word.split(",")
-    words = [w for w in [w.strip().lower().replace(" ", "_") for w in words] if w != ""]
+    words = word.split(',')
+    words = [w for w in [w.strip().lower().replace(' ', '_') for w in words] if w != ""]
     if len(words) == 0:
         # No words were found.
         return "", "Please specify a word to search for."
@@ -782,10 +789,10 @@ def page_from_reference(href):
             form = wn.morphy(w, pos)
             if form and form not in pos_forms[pos]:
                 pos_forms[pos].append(form)
-    body = ""
+    body = ''
     for pos, pos_str, name in _pos_tuples():
         if pos in pos_forms:
-            body += _hlev(3, name) + "\n"
+            body += _hlev(3, name) + '\n'
             for w in pos_forms[pos]:
                 # Not all words of exc files are in the database, skip
                 # to the next word if a KeyError is raised.
@@ -833,7 +840,7 @@ def get_static_web_help_page():
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
 <html>
      <!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
-            Copyright (C) 2001-2020 NLTK Project
+            Copyright (C) 2001-2019 NLTK Project
             Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
             URL: <http://nltk.org/>
             For license information, see LICENSE.TXT -->
@@ -903,7 +910,7 @@ def get_static_index_page(with_shutdown):
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"  "http://www.w3.org/TR/html4/frameset.dtd">
 <HTML>
      <!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
-            Copyright (C) 2001-2020 NLTK Project
+            Copyright (C) 2001-2019 NLTK Project
             Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
             URL: <http://nltk.org/>
             For license information, see LICENSE.TXT -->
@@ -936,7 +943,7 @@ def get_static_upper_page(with_shutdown):
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
 <html>
     <!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
-        Copyright (C) 2001-2020 NLTK Project
+        Copyright (C) 2001-2019 NLTK Project
         Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
         URL: <http://nltk.org/>
         For license information, see LICENSE.TXT -->
@@ -957,7 +964,7 @@ def get_static_upper_page(with_shutdown):
 </html>
 """
     if with_shutdown:
-        shutdown_link = '<a href="SHUTDOWN THE SERVER">Shutdown</a>'
+        shutdown_link = "<a href=\"SHUTDOWN THE SERVER\">Shutdown</a>"
     else:
         shutdown_link = ""
 
@@ -996,7 +1003,7 @@ def app():
         wnb(port, not server_mode, logfilename)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     app()
 
-__all__ = ["app"]
+__all__ = ['app']
diff --git a/nlp_resource_data/nltk/book.py b/nlp_resource_data/nltk/book.py
index 0098bed..e130ecd 100644 (file)
@@ -1,10 +1,11 @@
 # Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function
 
 from nltk.corpus import (
     gutenberg,
@@ -24,13 +25,13 @@ print("Loading text1, ..., text9 and sent1, ..., sent9")
 print("Type the name of the text or sentence to view it.")
 print("Type: 'texts()' or 'sents()' to list the materials.")
 
-text1 = Text(gutenberg.words("melville-moby_dick.txt"))
+text1 = Text(gutenberg.words('melville-moby_dick.txt'))
 print("text1:", text1.name)
 
-text2 = Text(gutenberg.words("austen-sense.txt"))
+text2 = Text(gutenberg.words('austen-sense.txt'))
 print("text2:", text2.name)
 
-text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
+text3 = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
 print("text3:", text3.name)
 
 text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
@@ -39,16 +40,16 @@ print("text4:", text4.name)
 text5 = Text(nps_chat.words(), name="Chat Corpus")
 print("text5:", text5.name)
 
-text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
+text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
 print("text6:", text6.name)
 
 text7 = Text(treebank.words(), name="Wall Street Journal")
 print("text7:", text7.name)
 
-text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
+text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
 print("text8:", text8.name)
 
-text9 = Text(gutenberg.words("chesterton-thursday.txt"))
+text9 = Text(gutenberg.words('chesterton-thursday.txt'))
 print("text9:", text9.name)
 
 
@@ -120,23 +121,23 @@ sent5 = [
     "JOIN",
 ]
 sent6 = [
-    "SCENE",
-    "1",
-    ":",
-    "[",
-    "wind",
-    "]",
-    "[",
-    "clop",
-    "clop",
-    "clop",
-    "]",
-    "KING",
-    "ARTHUR",
-    ":",
-    "Whoa",
-    "there",
-    "!",
+    'SCENE',
+    '1',
+    ':',
+    '[',
+    'wind',
+    ']',
+    '[',
+    'clop',
+    'clop',
+    'clop',
+    ']',
+    'KING',
+    'ARTHUR',
+    ':',
+    'Whoa',
+    'there',
+    '!',
 ]
 sent7 = [
     "Pierre",
@@ -159,20 +160,20 @@ sent7 = [
     ".",
 ]
 sent8 = [
-    "25",
-    "SEXY",
-    "MALE",
-    ",",
-    "seeks",
-    "attrac",
-    "older",
-    "single",
-    "lady",
-    ",",
-    "for",
-    "discreet",
-    "encounters",
-    ".",
+    '25',
+    'SEXY',
+    'MALE',
+    ',',
+    'seeks',
+    'attrac',
+    'older',
+    'single',
+    'lady',
+    ',',
+    'for',
+    'discreet',
+    'encounters',
+    '.',
 ]
 sent9 = [
     "THE",
diff --git a/nlp_resource_data/nltk/ccg/__init__.py b/nlp_resource_data/nltk/ccg/__init__.py
index 7d54311..40515aa 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Combinatory Categorial Grammar
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
index 10662e1..b69809c 100644 (file)
Binary files a/nlp_resource_data/nltk/ccg/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/ccg/__pycache__/__init__.cpython-37.pyc differ
index 25cb42d..6ed9de1 100644 (file)
Binary files a/nlp_resource_data/nltk/ccg/__pycache__/api.cpython-37.pyc and b/nlp_resource_data/nltk/ccg/__pycache__/api.cpython-37.pyc differ
index e610180..884151e 100644 (file)
Binary files a/nlp_resource_data/nltk/ccg/__pycache__/chart.cpython-37.pyc and b/nlp_resource_data/nltk/ccg/__pycache__/chart.cpython-37.pyc differ
index 0a30b63..3a832e7 100644 (file)
Binary files a/nlp_resource_data/nltk/ccg/__pycache__/combinator.cpython-37.pyc and b/nlp_resource_data/nltk/ccg/__pycache__/combinator.cpython-37.pyc differ
index bd6b1bd..55e1a29 100644 (file)
Binary files a/nlp_resource_data/nltk/ccg/__pycache__/lexicon.cpython-37.pyc and b/nlp_resource_data/nltk/ccg/__pycache__/lexicon.cpython-37.pyc differ
index df6e87b..273740f 100644 (file)
Binary files a/nlp_resource_data/nltk/ccg/__pycache__/logic.cpython-37.pyc and b/nlp_resource_data/nltk/ccg/__pycache__/logic.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/ccg/api.py b/nlp_resource_data/nltk/ccg/api.py
index 6278452..7173ea0 100644 (file)
@@ -1,21 +1,25 @@
 # Natural Language Toolkit: CCG Categories
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
-
+from __future__ import unicode_literals
 from functools import total_ordering
 
 from abc import ABCMeta, abstractmethod
+from six import add_metaclass
 
 from nltk.internals import raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible, unicode_repr
+
 
+@add_metaclass(ABCMeta)
 @total_ordering
-class AbstractCCGCategory(metaclass=ABCMeta):
-    """
+class AbstractCCGCategory(object):
+    '''
     Interface for categories in combinatory grammars.
-    """
+    '''
 
     @abstractmethod
     def is_primitive(self):
@@ -80,12 +84,13 @@ class AbstractCCGCategory(metaclass=ABCMeta):
             return self._hash
 
 
+@python_2_unicode_compatible
 class CCGVar(AbstractCCGCategory):
-    """
+    '''
     Class representing a variable CCG category.
     Used for conjunctions (and possibly type-raising, if implemented as a
     unary rule).
-    """
+    '''
 
     _maxID = 0
 
@@ -146,12 +151,13 @@ class CCGVar(AbstractCCGCategory):
 
 
 @total_ordering
+@python_2_unicode_compatible
 class Direction(object):
-    """
+    '''
     Class representing the direction of a function application.
     Also contains maintains information as to which combinators
     may be used with the category.
-    """
+    '''
 
     def __init__(self, dir, restrictions):
         self._dir = dir
@@ -160,10 +166,10 @@ class Direction(object):
 
     # Testing the application direction
     def is_forward(self):
-        return self._dir == "/"
+        return self._dir == '/'
 
     def is_backward(self):
-        return self._dir == "\\"
+        return self._dir == '\\'
 
     def dir(self):
         return self._dir
@@ -178,16 +184,16 @@ class Direction(object):
         return self._restrs
 
     def is_variable(self):
-        return self._restrs == "_"
+        return self._restrs == '_'
 
     # Unification and substitution of variable directions.
     # Used only if type-raising is implemented as a unary rule, as it
     # must inherit restrictions from the argument category.
     def can_unify(self, other):
         if other.is_variable():
-            return [("_", self.restrs())]
+            return [('_', self.restrs())]
         elif self.is_variable():
-            return [("_", other.restrs())]
+            return [('_', other.restrs())]
         else:
             if self.restrs() == other.restrs():
                 return []
@@ -198,16 +204,16 @@ class Direction(object):
             return self
 
         for (var, restrs) in subs:
-            if var == "_":
+            if var == '_':
                 return Direction(self._dir, restrs)
         return self
 
     # Testing permitted combinators
     def can_compose(self):
-        return "," not in self._restrs
+        return ',' not in self._restrs
 
     def can_cross(self):
-        return "." not in self._restrs
+        return '.' not in self._restrs
 
     def __eq__(self, other):
         return (
@@ -241,18 +247,19 @@ class Direction(object):
 
     # The negation operator reverses the direction of the application
     def __neg__(self):
-        if self._dir == "/":
-            return Direction("\\", self._restrs)
+        if self._dir == '/':
+            return Direction('\\', self._restrs)
         else:
-            return Direction("/", self._restrs)
+            return Direction('/', self._restrs)
 
 
+@python_2_unicode_compatible
 class PrimitiveCategory(AbstractCCGCategory):
-    """
+    '''
     Class representing primitive categories.
     Takes a string representation of the category, and a
     list of strings specifying the morphological subcategories.
-    """
+    '''
 
     def __init__(self, categ, restrictions=[]):
         self._categ = categ
@@ -296,16 +303,17 @@ class PrimitiveCategory(AbstractCCGCategory):
     def __str__(self):
         if self._restrs == []:
             return "%s" % self._categ
-        restrictions = "[%s]" % ",".join(repr(r) for r in self._restrs)
+        restrictions = "[%s]" % ",".join(unicode_repr(r) for r in self._restrs)
         return "%s%s" % (self._categ, restrictions)
 
 
+@python_2_unicode_compatible
 class FunctionalCategory(AbstractCCGCategory):
-    """
+    '''
     Class that represents a function application category.
     Consists of argument and result categories, together with
     an application direction.
-    """
+    '''
 
     def __init__(self, res, arg, dir):
         self._res = res
diff --git a/nlp_resource_data/nltk/ccg/chart.py b/nlp_resource_data/nltk/ccg/chart.py
index ab4807c..bd410c7 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Combinatory Categorial Grammar
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -29,9 +29,12 @@ which should print a nice representation of the derivation.
 This entire process is shown far more clearly in the demonstration:
 python chart.py
 """
+from __future__ import print_function, division, unicode_literals
 
 import itertools
 
+from six import string_types
+
 from nltk.parse import ParserI
 from nltk.parse.chart import AbstractChartRule, EdgeI, Chart
 from nltk.tree import Tree
@@ -48,7 +51,7 @@ from nltk.ccg.combinator import (
     BackwardBx,
     BackwardSx,
 )
-
+from nltk.compat import python_2_unicode_compatible
 from nltk.ccg.combinator import *
 from nltk.ccg.logic import *
 from nltk.sem.logic import *
@@ -102,9 +105,9 @@ class CCGEdge(EdgeI):
 
 
 class CCGLeafEdge(EdgeI):
-    """
+    '''
     Class representing leaf edges in a CCG derivation.
-    """
+    '''
 
     def __init__(self, pos, token, leaf):
         self._pos = pos
@@ -153,11 +156,12 @@ class CCGLeafEdge(EdgeI):
         return self._leaf
 
 
+@python_2_unicode_compatible
 class BinaryCombinatorRule(AbstractChartRule):
-    """
+    '''
     Class implementing application of a binary combinator to a chart.
     Takes the directed combinator to apply.
-    """
+    '''
 
     NUMEDGES = 2
 
@@ -189,12 +193,11 @@ class BinaryCombinatorRule(AbstractChartRule):
 
 # Type-raising must be handled slightly differently to the other rules, as the
 # resulting rules only span a single edge, rather than both edges.
-
-
+@python_2_unicode_compatible
 class ForwardTypeRaiseRule(AbstractChartRule):
-    """
+    '''
     Class for applying forward type raising
-    """
+    '''
 
     NUMEDGES = 2
 
@@ -214,10 +217,11 @@ class ForwardTypeRaiseRule(AbstractChartRule):
         return "%s" % self._combinator
 
 
+@python_2_unicode_compatible
 class BackwardTypeRaiseRule(AbstractChartRule):
-    """
+    '''
     Class for applying backward type raising.
-    """
+    '''
 
     NUMEDGES = 2
 
@@ -260,10 +264,10 @@ DefaultRuleSet = (
 
 
 class CCGChartParser(ParserI):
-    """
+    '''
     Chart parser for CCGs.
     Based largely on the ChartParser class from NLTK.
-    """
+    '''
 
     def __init__(self, lexicon, rules, trace=0):
         self._lexicon = lexicon
@@ -365,7 +369,7 @@ def compute_semantics(children, edge):
         elif isinstance(combinator, UndirectedSubstitution):
             return compute_substitution_semantics(function, argument)
         else:
-            raise AssertionError("Unsupported combinator '" + combinator + "'")
+            raise AssertionError('Unsupported combinator \'' + combinator + '\'')
     else:
         return compute_type_raised_semantics(children[0].label()[0].semantics())
 
@@ -376,8 +380,8 @@ def compute_semantics(children, edge):
 def printCCGDerivation(tree):
     # Get the leaves and initial categories
     leafcats = tree.pos()
-    leafstr = ""
-    catstr = ""
+    leafstr = ''
+    catstr = ''
 
     # Construct a string with both the leaf word and corresponding
     # category aligned.
@@ -386,10 +390,10 @@ def printCCGDerivation(tree):
         nextlen = 2 + max(len(leaf), len(str_cat))
         lcatlen = (nextlen - len(str_cat)) // 2
         rcatlen = lcatlen + (nextlen - len(str_cat)) % 2
-        catstr += " " * lcatlen + str_cat + " " * rcatlen
+        catstr += ' ' * lcatlen + str_cat + ' ' * rcatlen
         lleaflen = (nextlen - len(leaf)) // 2
         rleaflen = lleaflen + (nextlen - len(leaf)) % 2
-        leafstr += " " * lleaflen + leaf + " " * rleaflen
+        leafstr += ' ' * lleaflen + leaf + ' ' * rleaflen
     print(leafstr.rstrip())
     print(catstr.rstrip())
 
@@ -419,18 +423,18 @@ def printCCGTree(lwidth, tree):
 
     (token, op) = tree.label()
 
-    if op == "Leaf":
+    if op == 'Leaf':
         return rwidth
 
     # Pad to the left with spaces, followed by a sequence of '-'
     # and the derivation rule.
-    print(lwidth * " " + (rwidth - lwidth) * "-" + "%s" % op)
+    print(lwidth * ' ' + (rwidth - lwidth) * '-' + "%s" % op)
     # Print the resulting category on a new line.
     str_res = "%s" % (token.categ())
     if token.semantics() is not None:
         str_res += " {" + str(token.semantics()) + "}"
     respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth
-    print(respadlen * " " + str_res)
+    print(respadlen * ' ' + str_res)
     return rwidth
 
 
@@ -438,7 +442,7 @@ def printCCGTree(lwidth, tree):
 
 # Construct the lexicon
 lex = fromstring(
-    """
+    '''
     :- S, NP, N, VP    # Primitive categories, S is the target primitive
 
     Det :: NP/N         # Family of words
@@ -467,7 +471,7 @@ lex = fromstring(
     mushrooms => N
     parsnips => N
     bacon => N
-    """
+    '''
 )
 
 
@@ -477,5 +481,5 @@ def demo():
         printCCGDerivation(parse)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
diff --git a/nlp_resource_data/nltk/ccg/combinator.py b/nlp_resource_data/nltk/ccg/combinator.py
index 60bb149..56f15ed 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Combinatory Categorial Grammar
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -8,12 +8,16 @@
 CCG Combinators
 """
 
+from __future__ import unicode_literals
 from abc import ABCMeta, abstractmethod
+from six import add_metaclass
 
+from nltk.compat import python_2_unicode_compatible
 from nltk.ccg.api import FunctionalCategory
 
 
-class UndirectedBinaryCombinator(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class UndirectedBinaryCombinator(object):
     """
     Abstract class for representing a binary combinator.
     Merely defines functions for checking if the function and argument
@@ -34,7 +38,8 @@ class UndirectedBinaryCombinator(metaclass=ABCMeta):
         pass
 
 
-class DirectedBinaryCombinator(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class DirectedBinaryCombinator(object):
     """
     Wrapper for the undirected binary combinator.
     It takes left and right categories, and decides which is to be
@@ -51,6 +56,7 @@ class DirectedBinaryCombinator(metaclass=ABCMeta):
         pass
 
 
+@python_2_unicode_compatible
 class ForwardCombinator(DirectedBinaryCombinator):
     """
     Class representing combinators where the primary functor is on the left.
@@ -59,7 +65,7 @@ class ForwardCombinator(DirectedBinaryCombinator):
     restricting the cases in which it may apply.
     """
 
-    def __init__(self, combinator, predicate, suffix=""):
+    def __init__(self, combinator, predicate, suffix=''):
         self._combinator = combinator
         self._predicate = predicate
         self._suffix = suffix
@@ -77,12 +83,13 @@ class ForwardCombinator(DirectedBinaryCombinator):
         return ">%s%s" % (self._combinator, self._suffix)
 
 
+@python_2_unicode_compatible
 class BackwardCombinator(DirectedBinaryCombinator):
     """
     The backward equivalent of the ForwardCombinator class.
     """
 
-    def __init__(self, combinator, predicate, suffix=""):
+    def __init__(self, combinator, predicate, suffix=''):
         self._combinator = combinator
         self._predicate = predicate
         self._suffix = suffix
@@ -100,6 +107,7 @@ class BackwardCombinator(DirectedBinaryCombinator):
         return "<%s%s" % (self._combinator, self._suffix)
 
 
+@python_2_unicode_compatible
 class UndirectedFunctionApplication(UndirectedBinaryCombinator):
     """
     Class representing function application.
@@ -125,7 +133,7 @@ class UndirectedFunctionApplication(UndirectedBinaryCombinator):
         yield function.res().substitute(subs)
 
     def __str__(self):
-        return ""
+        return ''
 
 
 # Predicates for function application.
@@ -145,6 +153,7 @@ ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(), forwardO
 BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(), backwardOnly)
 
 
+@python_2_unicode_compatible
 class UndirectedComposition(UndirectedBinaryCombinator):
     """
     Functional composition (harmonic) combinator.
@@ -175,7 +184,7 @@ class UndirectedComposition(UndirectedBinaryCombinator):
                 )
 
     def __str__(self):
-        return "B"
+        return 'B'
 
 
 # Predicates for restricting application of straight composition.
@@ -209,10 +218,11 @@ BackwardComposition = BackwardCombinator(UndirectedComposition(), backwardOnly)
 
 # Backward crossed composition
 BackwardBx = BackwardCombinator(
-    UndirectedComposition(), backwardBxConstraint, suffix="x"
+    UndirectedComposition(), backwardBxConstraint, suffix='x'
 )
 
 
+@python_2_unicode_compatible
 class UndirectedSubstitution(UndirectedBinaryCombinator):
     """
     Substitution (permutation) combinator.
@@ -245,7 +255,7 @@ class UndirectedSubstitution(UndirectedBinaryCombinator):
             )
 
     def __str__(self):
-        return "S"
+        return 'S'
 
 
 # Predicate for forward substitution
@@ -266,7 +276,7 @@ def backwardSxConstraint(left, right):
 
 # Instances of substitution combinators
 ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(), forwardSConstraint)
-BackwardSx = BackwardCombinator(UndirectedSubstitution(), backwardSxConstraint, "x")
+BackwardSx = BackwardCombinator(UndirectedSubstitution(), backwardSxConstraint, 'x')
 
 
 # Retrieves the left-most functional category.
@@ -277,6 +287,7 @@ def innermostFunction(categ):
     return categ
 
 
+@python_2_unicode_compatible
 class UndirectedTypeRaise(UndirectedBinaryCombinator):
     """
     Undirected combinator for type raising.
@@ -318,7 +329,7 @@ class UndirectedTypeRaise(UndirectedBinaryCombinator):
             )
 
     def __str__(self):
-        return "T"
+        return 'T'
 
 
 # Predicates for type-raising
diff --git a/nlp_resource_data/nltk/ccg/lexicon.py b/nlp_resource_data/nltk/ccg/lexicon.py
index 628eb8a..d8e2bf3 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Combinatory Categorial Grammar
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -8,10 +8,13 @@
 CCG Lexicons
 """
 
+from __future__ import unicode_literals
+
 import re
 from collections import defaultdict
 
 from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory
+from nltk.compat import python_2_unicode_compatible
 from nltk.internals import deprecated
 
 from nltk.sem.logic import Expression
@@ -21,26 +24,26 @@ from nltk.sem.logic import Expression
 # ------------
 
 # Parses a primitive category and subscripts
-PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""")
+PRIM_RE = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''')
 
 # Separates the next primitive category from the remainder of the
 # string
-NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""")
+NEXTPRIM_RE = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')
 
 # Separates the next application operator from the remainder
-APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""")
+APP_RE = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')
 
 # Parses the definition of the right-hand side (rhs) of either a word or a family
-LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE)
+LEX_RE = re.compile(r'''([\S_]+)\s*(::|[-=]+>)\s*(.+)''', re.UNICODE)
 
 # Parses the right hand side that contains category and maybe semantic predicate
-RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE)
+RHS_RE = re.compile(r'''([^{}]*[^ {}])\s*(\{[^}]+\})?''', re.UNICODE)
 
 # Parses the semantic predicate
-SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE)
+SEMANTICS_RE = re.compile(r'''\{([^}]+)\}''', re.UNICODE)
 
 # Strips comments from a line
-COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""")
+COMMENTS_RE = re.compile('''([^#]*)(?:#.*)?''')
 
 
 class Token(object):
@@ -78,6 +81,7 @@ class Token(object):
         return cmp((self._categ, self._semantics), other.categ(), other.semantics())
 
 
+@python_2_unicode_compatible
 class CCGLexicon(object):
     """
     Class representing a lexicon for CCG grammars.
@@ -139,16 +143,16 @@ def matchBrackets(string):
     rest = string[1:]
     inside = "("
 
-    while rest != "" and not rest.startswith(")"):
-        if rest.startswith("("):
+    while rest != "" and not rest.startswith(')'):
+        if rest.startswith('('):
             (part, rest) = matchBrackets(rest)
             inside = inside + part
         else:
             inside = inside + rest[0]
             rest = rest[1:]
-    if rest.startswith(")"):
-        return (inside + ")", rest[1:])
-    raise AssertionError("Unmatched bracket in string '" + string + "'")
+    if rest.startswith(')'):
+        return (inside + ')', rest[1:])
+    raise AssertionError('Unmatched bracket in string \'' + string + '\'')
 
 
 def nextCategory(string):
@@ -156,7 +160,7 @@ def nextCategory(string):
     Separate the string for the next portion of the category from the rest
     of the string
     """
-    if string.startswith("("):
+    if string.startswith('('):
         return matchBrackets(string)
     return NEXTPRIM_RE.match(string).groups()
 
@@ -173,7 +177,7 @@ def parseSubscripts(subscr):
     Parse the subscripts for a primitive category
     """
     if subscr:
-        return subscr[1:-1].split(",")
+        return subscr[1:-1].split(',')
     return []
 
 
@@ -203,7 +207,7 @@ def parsePrimitiveCategory(chunks, primitives, families, var):
         subscrs = parseSubscripts(chunks[1])
         return (PrimitiveCategory(catstr, subscrs), var)
     raise AssertionError(
-        "String '" + catstr + "' is neither a family nor primitive category."
+        'String \'' + catstr + '\' is neither a family nor primitive category.'
     )
 
 
@@ -214,10 +218,11 @@ def augParseCategory(line, primitives, families, var=None):
     """
     (cat_string, rest) = nextCategory(line)
 
-    if cat_string.startswith("("):
+    if cat_string.startswith('('):
         (res, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
 
     else:
+        #        print rePrim.match(str).groups()
         (res, var) = parsePrimitiveCategory(
             PRIM_RE.match(cat_string).groups(), primitives, families, var
         )
@@ -228,7 +233,7 @@ def augParseCategory(line, primitives, families, var=None):
         rest = app[3]
 
         (cat_string, rest) = nextCategory(rest)
-        if cat_string.startswith("("):
+        if cat_string.startswith('('):
             (arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
         else:
             (arg, var) = parsePrimitiveCategory(
@@ -253,12 +258,12 @@ def fromstring(lex_str, include_semantics=False):
         if line == "":
             continue
 
-        if line.startswith(":-"):
+        if line.startswith(':-'):
             # A line of primitive categories.
             # The first one is the target category
             # ie, :- S, N, NP, VP
             primitives = primitives + [
-                prim.strip() for prim in line[2:].strip().split(",")
+                prim.strip() for prim in line[2:].strip().split(',')
             ]
         else:
             # Either a family definition, or a word definition
@@ -266,7 +271,7 @@ def fromstring(lex_str, include_semantics=False):
             (catstr, semantics_str) = RHS_RE.match(rhs).groups()
             (cat, var) = augParseCategory(catstr, primitives, families)
 
-            if sep == "::":
+            if sep == '::':
                 # Family definition
                 # ie, Det :: NP/N
                 families[ident] = (cat, var)
@@ -288,7 +293,7 @@ def fromstring(lex_str, include_semantics=False):
     return CCGLexicon(primitives[0], primitives, families, entries)
 
 
-@deprecated("Use fromstring() instead.")
+@deprecated('Use fromstring() instead.')
 def parseLexicon(lex_str):
     return fromstring(lex_str)
 
diff --git a/nlp_resource_data/nltk/ccg/logic.py b/nlp_resource_data/nltk/ccg/logic.py
index 37b87f3..b89bea9 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Combinatory Categorial Grammar
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Tanin Na Nakorn (@tanin)
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
diff --git a/nlp_resource_data/nltk/chat/__init__.py b/nlp_resource_data/nltk/chat/__init__.py
index d34def9..cd0ad40 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Chatbots
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -15,6 +15,7 @@ typed by users, and respond with automatically generated sentences.
 These chatbots may not work using the windows command line or the
 windows IDLE GUI.
 """
+from __future__ import print_function
 
 from nltk.chat.util import Chat
 from nltk.chat.eliza import eliza_chat
@@ -24,28 +25,28 @@ from nltk.chat.suntsu import suntsu_chat
 from nltk.chat.zen import zen_chat
 
 bots = [
-    (eliza_chat, "Eliza (psycho-babble)"),
-    (iesha_chat, "Iesha (teen anime junky)"),
-    (rude_chat, "Rude (abusive bot)"),
-    (suntsu_chat, "Suntsu (Chinese sayings)"),
-    (zen_chat, "Zen (gems of wisdom)"),
+    (eliza_chat, 'Eliza (psycho-babble)'),
+    (iesha_chat, 'Iesha (teen anime junky)'),
+    (rude_chat, 'Rude (abusive bot)'),
+    (suntsu_chat, 'Suntsu (Chinese sayings)'),
+    (zen_chat, 'Zen (gems of wisdom)'),
 ]
 
 
 def chatbots():
     import sys
 
-    print("Which chatbot would you like to talk to?")
+    print('Which chatbot would you like to talk to?')
     botcount = len(bots)
     for i in range(botcount):
-        print("  %d: %s" % (i + 1, bots[i][1]))
+        print('  %d: %s' % (i + 1, bots[i][1]))
     while True:
-        print("\nEnter a number in the range 1-%d: " % botcount, end=" ")
+        print('\nEnter a number in the range 1-%d: ' % botcount, end=' ')
         choice = sys.stdin.readline().strip()
         if choice.isdigit() and (int(choice) - 1) in range(botcount):
             break
         else:
-            print("   Error: bad chatbot number")
+            print('   Error: bad chatbot number')
 
     chatbot = bots[int(choice) - 1][0]
     chatbot()
index 2af6a37..8967c15 100644 (file)
Binary files a/nlp_resource_data/nltk/chat/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/chat/__pycache__/__init__.cpython-37.pyc differ
index 2847528..22a2521 100644 (file)
Binary files a/nlp_resource_data/nltk/chat/__pycache__/eliza.cpython-37.pyc and b/nlp_resource_data/nltk/chat/__pycache__/eliza.cpython-37.pyc differ
index 9abe7ff..9be3d89 100644 (file)
Binary files a/nlp_resource_data/nltk/chat/__pycache__/iesha.cpython-37.pyc and b/nlp_resource_data/nltk/chat/__pycache__/iesha.cpython-37.pyc differ
index ee564b2..fa38e07 100644 (file)
Binary files a/nlp_resource_data/nltk/chat/__pycache__/rude.cpython-37.pyc and b/nlp_resource_data/nltk/chat/__pycache__/rude.cpython-37.pyc differ
index 1487e3f..2278d0e 100644 (file)
Binary files a/nlp_resource_data/nltk/chat/__pycache__/suntsu.cpython-37.pyc and b/nlp_resource_data/nltk/chat/__pycache__/suntsu.cpython-37.pyc differ
index 541a5eb..35f68fd 100644 (file)
Binary files a/nlp_resource_data/nltk/chat/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/chat/__pycache__/util.cpython-37.pyc differ
index 2003d48..a585b4c 100644 (file)
Binary files a/nlp_resource_data/nltk/chat/__pycache__/zen.cpython-37.pyc and b/nlp_resource_data/nltk/chat/__pycache__/zen.cpython-37.pyc differ
index 4731c8f..ef23b80 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Eliza
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors: Steven Bird <stevenbird1@gmail.com>
 #          Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -12,6 +12,7 @@
 # a translation table used to convert things you say into things the
 # computer says back, e.g. "I am" --> "you are"
 
+from __future__ import print_function
 from nltk.chat.util import Chat, reflections
 
 # a table of response pairs, where each pair consists of a
@@ -20,7 +21,7 @@ from nltk.chat.util import Chat, reflections
 
 pairs = (
     (
-        r"I need (.*)",
+        r'I need (.*)',
         (
             "Why do you need %1?",
             "Would it really help you to get %1?",
@@ -28,7 +29,7 @@ pairs = (
         ),
     ),
     (
-        r"Why don\'t you (.*)",
+        r'Why don\'t you (.*)',
         (
             "Do you really think I don't %1?",
             "Perhaps eventually I will %1.",
@@ -36,7 +37,7 @@ pairs = (
         ),
     ),
     (
-        r"Why can\'t I (.*)",
+        r'Why can\'t I (.*)',
         (
             "Do you think you should be able to %1?",
             "If you could %1, what would you do?",
@@ -45,7 +46,7 @@ pairs = (
         ),
     ),
     (
-        r"I can\'t (.*)",
+        r'I can\'t (.*)',
         (
             "How do you know you can't %1?",
             "Perhaps you could %1 if you tried.",
@@ -53,7 +54,7 @@ pairs = (
         ),
     ),
     (
-        r"I am (.*)",
+        r'I am (.*)',
         (
             "Did you come to me because you are %1?",
             "How long have you been %1?",
@@ -61,7 +62,7 @@ pairs = (
         ),
     ),
     (
-        r"I\'m (.*)",
+        r'I\'m (.*)',
         (
             "How does being %1 make you feel?",
             "Do you enjoy being %1?",
@@ -70,7 +71,7 @@ pairs = (
         ),
     ),
     (
-        r"Are you (.*)",
+        r'Are you (.*)',
         (
             "Why does it matter whether I am %1?",
             "Would you prefer it if I were not %1?",
@@ -79,7 +80,7 @@ pairs = (
         ),
     ),
     (
-        r"What (.*)",
+        r'What (.*)',
         (
             "Why do you ask?",
             "How would an answer to that help you?",
@@ -87,7 +88,7 @@ pairs = (
         ),
     ),
     (
-        r"How (.*)",
+        r'How (.*)',
         (
             "How do you suppose?",
             "Perhaps you can answer your own question.",
@@ -95,7 +96,7 @@ pairs = (
         ),
     ),
     (
-        r"Because (.*)",
+        r'Because (.*)',
         (
             "Is that the real reason?",
             "What other reasons come to mind?",
@@ -104,14 +105,14 @@ pairs = (
         ),
     ),
     (
-        r"(.*) sorry (.*)",
+        r'(.*) sorry (.*)',
         (
             "There are many times when no apology is needed.",
             "What feelings do you have when you apologize?",
         ),
     ),
     (
-        r"Hello(.*)",
+        r'Hello(.*)',
         (
             "Hello... I'm glad you could drop by today.",
             "Hi there... how are you today?",
@@ -119,20 +120,20 @@ pairs = (
         ),
     ),
     (
-        r"I think (.*)",
+        r'I think (.*)',
         ("Do you doubt %1?", "Do you really think so?", "But you're not sure %1?"),
     ),
     (
-        r"(.*) friend (.*)",
+        r'(.*) friend (.*)',
         (
             "Tell me more about your friends.",
             "When you think of a friend, what comes to mind?",
             "Why don't you tell me about a childhood friend?",
         ),
     ),
-    (r"Yes", ("You seem quite sure.", "OK, but can you elaborate a bit?")),
+    (r'Yes', ("You seem quite sure.", "OK, but can you elaborate a bit?")),
     (
-        r"(.*) computer(.*)",
+        r'(.*) computer(.*)',
         (
             "Are you really talking about me?",
             "Does it seem strange to talk to a computer?",
@@ -141,7 +142,7 @@ pairs = (
         ),
     ),
     (
-        r"Is it (.*)",
+        r'Is it (.*)',
         (
             "Do you think it is %1?",
             "Perhaps it's %1 -- what do you think?",
@@ -150,14 +151,14 @@ pairs = (
         ),
     ),
     (
-        r"It is (.*)",
+        r'It is (.*)',
         (
             "You seem very certain.",
             "If I told you that it probably isn't %1, what would you feel?",
         ),
     ),
     (
-        r"Can you (.*)",
+        r'Can you (.*)',
         (
             "What makes you think I can't %1?",
             "If I could %1, then what?",
@@ -165,7 +166,7 @@ pairs = (
         ),
     ),
     (
-        r"Can I (.*)",
+        r'Can I (.*)',
         (
             "Perhaps you don't want to %1.",
             "Do you want to be able to %1?",
@@ -173,7 +174,7 @@ pairs = (
         ),
     ),
     (
-        r"You are (.*)",
+        r'You are (.*)',
         (
             "Why do you think I am %1?",
             "Does it please you to think that I'm %1?",
@@ -182,7 +183,7 @@ pairs = (
         ),
     ),
     (
-        r"You\'re (.*)",
+        r'You\'re (.*)',
         (
             "Why do you say I am %1?",
             "Why do you think I am %1?",
@@ -190,11 +191,11 @@ pairs = (
         ),
     ),
     (
-        r"I don\'t (.*)",
+        r'I don\'t (.*)',
         ("Don't you really %1?", "Why don't you %1?", "Do you want to %1?"),
     ),
     (
-        r"I feel (.*)",
+        r'I feel (.*)',
         (
             "Good, tell me more about these feelings.",
             "Do you often feel %1?",
@@ -203,7 +204,7 @@ pairs = (
         ),
     ),
     (
-        r"I have (.*)",
+        r'I have (.*)',
         (
             "Why do you tell me that you've %1?",
             "Have you really %1?",
@@ -211,7 +212,7 @@ pairs = (
         ),
     ),
     (
-        r"I would (.*)",
+        r'I would (.*)',
         (
             "Could you explain why you would %1?",
             "Why would you %1?",
@@ -219,7 +220,7 @@ pairs = (
         ),
     ),
     (
-        r"Is there (.*)",
+        r'Is there (.*)',
         (
             "Do you think there is %1?",
             "It's likely that there is %1.",
@@ -227,7 +228,7 @@ pairs = (
         ),
     ),
     (
-        r"My (.*)",
+        r'My (.*)',
         (
             "I see, your %1.",
             "Why do you say that your %1?",
@@ -235,16 +236,16 @@ pairs = (
         ),
     ),
     (
-        r"You (.*)",
+        r'You (.*)',
         (
             "We should be discussing you, not me.",
             "Why do you say that about me?",
             "Why do you care whether I %1?",
         ),
     ),
-    (r"Why (.*)", ("Why don't you tell me the reason why %1?", "Why do you think %1?")),
+    (r'Why (.*)', ("Why don't you tell me the reason why %1?", "Why do you think %1?")),
     (
-        r"I want (.*)",
+        r'I want (.*)',
         (
             "What would it mean to you if you got %1?",
             "Why do you want %1?",
@@ -253,7 +254,7 @@ pairs = (
         ),
     ),
     (
-        r"(.*) mother(.*)",
+        r'(.*) mother(.*)',
         (
             "Tell me more about your mother.",
             "What was your relationship with your mother like?",
@@ -263,7 +264,7 @@ pairs = (
         ),
     ),
     (
-        r"(.*) father(.*)",
+        r'(.*) father(.*)',
         (
             "Tell me more about your father.",
             "How did your father make you feel?",
@@ -273,7 +274,7 @@ pairs = (
         ),
     ),
     (
-        r"(.*) child(.*)",
+        r'(.*) child(.*)',
         (
             "Did you have close friends as a child?",
             "What is your favorite childhood memory?",
@@ -283,7 +284,7 @@ pairs = (
         ),
     ),
     (
-        r"(.*)\?",
+        r'(.*)\?',
         (
             "Why do you ask that?",
             "Please consider whether you can answer your own question.",
@@ -292,7 +293,7 @@ pairs = (
         ),
     ),
     (
-        r"quit",
+        r'quit',
         (
             "Thank you for talking with me.",
             "Good-bye.",
@@ -300,7 +301,7 @@ pairs = (
         ),
     ),
     (
-        r"(.*)",
+        r'(.*)',
         (
             "Please tell me more.",
             "Let's change focus a bit... Tell me about your family.",
@@ -323,7 +324,7 @@ def eliza_chat():
     print("Therapist\n---------")
     print("Talk to the program by typing in plain English, using normal upper-")
     print('and lower-case letters and punctuation.  Enter "quit" when done.')
-    print("=" * 72)
+    print('=' * 72)
     print("Hello.  How are you feeling today?")
 
     eliza_chatbot.converse()
index 55318af..4a7a615 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Teen Chatbot
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Selina Dennis <sjmd@csse.unimelb.edu.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -10,6 +10,7 @@ This chatbot is a tongue-in-cheek take on the average teen
 anime junky that frequents YahooMessenger or MSNM.
 All spelling mistakes and flawed grammar are intentional.
 """
+from __future__ import print_function
 
 from nltk.chat.util import Chat
 
@@ -40,27 +41,27 @@ reflections = {
 
 pairs = (
     (
-        r"I\'m (.*)",
+        r'I\'m (.*)',
         (
             "ur%1?? that's so cool! kekekekeke ^_^ tell me more!",
             "ur%1? neat!! kekeke >_<",
         ),
     ),
     (
-        r"(.*) don\'t you (.*)",
+        r'(.*) don\'t you (.*)',
         (
             "u think I can%2??! really?? kekeke \<_\<",
             "what do u mean%2??!",
             "i could if i wanted, don't you think!! kekeke",
         ),
     ),
-    (r"ye[as] [iI] (.*)", ("u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")),
+    (r'ye[as] [iI] (.*)', ("u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")),
     (
-        r"do (you|u) (.*)\??",
+        r'do (you|u) (.*)\??',
         ("do i%2? only on tuesdays! kekeke *_*", "i dunno! do u%2??"),
     ),
     (
-        r"(.*)\?",
+        r'(.*)\?',
         (
             "man u ask lots of questions!",
             "booooring! how old r u??",
@@ -68,11 +69,11 @@ pairs = (
         ),
     ),
     (
-        r"(cos|because) (.*)",
+        r'(cos|because) (.*)',
         ("hee! i don't believe u! >_<", "nuh-uh! >_<", "ooooh i agree!"),
     ),
     (
-        r"why can\'t [iI] (.*)",
+        r'why can\'t [iI] (.*)',
         (
             "i dunno! y u askin me for!",
             "try harder, silly! hee! ^_^",
@@ -80,7 +81,7 @@ pairs = (
         ),
     ),
     (
-        r"I can\'t (.*)",
+        r'I can\'t (.*)',
         (
             "u can't what??! >_<",
             "that's ok! i can't%1 either! kekekekeke ^_^",
@@ -88,7 +89,7 @@ pairs = (
         ),
     ),
     (
-        r"(.*) (like|love|watch) anime",
+        r'(.*) (like|love|watch) anime',
         (
             "omg i love anime!! do u like sailor moon??! ^&^",
             "anime yay! anime rocks sooooo much!",
@@ -98,11 +99,11 @@ pairs = (
         ),
     ),
     (
-        r"I (like|love|watch|play) (.*)",
+        r'I (like|love|watch|play) (.*)',
         ("yay! %2 rocks!", "yay! %2 is neat!", "cool! do u like other stuff?? ^_^"),
     ),
     (
-        r"anime sucks|(.*) (hate|detest) anime",
+        r'anime sucks|(.*) (hate|detest) anime',
         (
             "ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*",
             "no way! anime is the best ever!",
@@ -110,17 +111,17 @@ pairs = (
         ),
     ),
     (
-        r"(are|r) (you|u) (.*)",
+        r'(are|r) (you|u) (.*)',
         ("am i%1??! how come u ask that!", "maybe!  y shud i tell u?? kekeke >_>"),
     ),
     (
-        r"what (.*)",
+        r'what (.*)',
         ("hee u think im gonna tell u? .v.", "booooooooring! ask me somethin else!"),
     ),
-    (r"how (.*)", ("not tellin!! kekekekekeke ^_^",)),
-    (r"(hi|hello|hey) (.*)", ("hi!!! how r u!!",)),
+    (r'how (.*)', ("not tellin!! kekekekekeke ^_^",)),
+    (r'(hi|hello|hey) (.*)', ("hi!!! how r u!!",)),
     (
-        r"quit",
+        r'quit',
         (
             "mom says i have to go eat dinner now :,( bye!!",
             "awww u have to go?? see u next time!!",
@@ -128,7 +129,7 @@ pairs = (
         ),
     ),
     (
-        r"(.*)",
+        r'(.*)',
         (
             "ur funny! kekeke",
             "boooooring! talk about something else! tell me wat u like!",
@@ -146,7 +147,7 @@ def iesha_chat():
     print("Iesha the TeenBoT\n---------")
     print("Talk to the program by typing in plain English, using normal upper-")
     print('and lower-case letters and punctuation.  Enter "quit" when done.')
-    print("=" * 72)
+    print('=' * 72)
     print("hi!! i'm iesha! who r u??!")
 
     iesha_chatbot.converse()
index c7b1b1b..c9c9de8 100644 (file)
@@ -1,15 +1,16 @@
 # Natural Language Toolkit: Rude Chatbot
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Peter Spiller <pspiller@csse.unimelb.edu.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function
 
 from nltk.chat.util import Chat, reflections
 
 pairs = (
     (
-        r"We (.*)",
+        r'We (.*)',
         (
             "What do you mean, 'we'?",
             "Don't include me in that!",
@@ -17,11 +18,11 @@ pairs = (
         ),
     ),
     (
-        r"You should (.*)",
+        r'You should (.*)',
         ("Don't tell me what to do, buddy.", "Really? I should, should I?"),
     ),
     (
-        r"You\'re(.*)",
+        r'You\'re(.*)',
         (
             "More like YOU'RE %1!",
             "Hah! Look who's talking.",
@@ -29,7 +30,7 @@ pairs = (
         ),
     ),
     (
-        r"You are(.*)",
+        r'You are(.*)',
         (
             "More like YOU'RE %1!",
             "Hah! Look who's talking.",
@@ -37,7 +38,7 @@ pairs = (
         ),
     ),
     (
-        r"I can\'t(.*)",
+        r'I can\'t(.*)',
         (
             "You do sound like the type who can't %1.",
             "Hear that splashing sound? That's my heart bleeding for you.",
@@ -45,14 +46,14 @@ pairs = (
         ),
     ),
     (
-        r"I think (.*)",
+        r'I think (.*)',
         (
             "I wouldn't think too hard if I were you.",
             "You actually think? I'd never have guessed...",
         ),
     ),
     (
-        r"I (.*)",
+        r'I (.*)',
         (
             "I'm getting a bit tired of hearing about you.",
             "How about we talk about me instead?",
@@ -60,23 +61,23 @@ pairs = (
         ),
     ),
     (
-        r"How (.*)",
+        r'How (.*)',
         (
             "How do you think?",
             "Take a wild guess.",
             "I'm not even going to dignify that with an answer.",
         ),
     ),
-    (r"What (.*)", ("Do I look like an encyclopedia?", "Figure it out yourself.")),
+    (r'What (.*)', ("Do I look like an encyclopedia?", "Figure it out yourself.")),
     (
-        r"Why (.*)",
+        r'Why (.*)',
         (
             "Why not?",
             "That's so obvious I thought even you'd have already figured it out.",
         ),
     ),
     (
-        r"(.*)shut up(.*)",
+        r'(.*)shut up(.*)',
         (
             "Make me.",
             "Getting angry at a feeble NLP assignment? Somebody's losing it.",
@@ -84,7 +85,7 @@ pairs = (
         ),
     ),
     (
-        r"Shut up(.*)",
+        r'Shut up(.*)',
         (
             "Make me.",
             "Getting angry at a feeble NLP assignment? Somebody's losing it.",
@@ -92,11 +93,11 @@ pairs = (
         ),
     ),
     (
-        r"Hello(.*)",
+        r'Hello(.*)',
         ("Oh good, somebody else to talk to. Joy.", "'Hello'? How original..."),
     ),
     (
-        r"(.*)",
+        r'(.*)',
         (
             "I'm getting bored here. Become more interesting.",
             "Either become more thrilling or get lost, buddy.",
@@ -111,7 +112,7 @@ rude_chatbot = Chat(pairs, reflections)
 def rude_chat():
     print("Talk to the program by typing in plain English, using normal upper-")
     print('and lower-case letters and punctuation.  Enter "quit" when done.')
-    print("=" * 72)
+    print('=' * 72)
     print("I suppose I should say hello.")
 
     rude_chatbot.converse()
index 4c68a77..9f6dc34 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Sun Tsu-Bot
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Sam Huston 2007
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -13,27 +13,28 @@ Translated by LIONEL GILES, M.A. 1910
 Hosted by the Gutenberg Project
 http://www.gutenberg.org/
 """
+from __future__ import print_function
 
 from nltk.chat.util import Chat, reflections
 
 pairs = (
-    (r"quit", ("Good-bye.", "Plan well", "May victory be your future")),
+    (r'quit', ("Good-bye.", "Plan well", "May victory be your future")),
     (
-        r"[^\?]*\?",
+        r'[^\?]*\?',
         (
             "Please consider whether you can answer your own question.",
             "Ask me no questions!",
         ),
     ),
     (
-        r"[0-9]+(.*)",
+        r'[0-9]+(.*)',
         (
             "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
             "There are five essentials for victory",
         ),
     ),
     (
-        r"[A-Ca-c](.*)",
+        r'[A-Ca-c](.*)',
         (
             "The art of war is of vital importance to the State.",
             "All warfare is based on deception.",
@@ -44,7 +45,7 @@ pairs = (
         ),
     ),
     (
-        r"[D-Fd-f](.*)",
+        r'[D-Fd-f](.*)',
         (
             "The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.",
             "Bring war material with you from home, but forage on the enemy.",
@@ -53,7 +54,7 @@ pairs = (
         ),
     ),
     (
-        r"[G-Ig-i](.*)",
+        r'[G-Ig-i](.*)',
         (
             "Heaven signifies night and day, cold and heat, times and seasons.",
             "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
@@ -62,7 +63,7 @@ pairs = (
         ),
     ),
     (
-        r"[J-Lj-l](.*)",
+        r'[J-Lj-l](.*)',
         (
             "There are three ways in which a ruler can bring misfortune upon his army.",
             "By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.",
@@ -77,7 +78,7 @@ pairs = (
         ),
     ),
     (
-        r"[M-Om-o](.*)",
+        r'[M-Om-o](.*)',
         (
             "If you know the enemy and know yourself, you need not fear the result of a hundred battles.",
             "If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.",
@@ -86,7 +87,7 @@ pairs = (
         ),
     ),
     (
-        r"[P-Rp-r](.*)",
+        r'[P-Rp-r](.*)',
         (
             "Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.",
             "Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.",
@@ -96,7 +97,7 @@ pairs = (
         ),
     ),
     (
-        r"[S-Us-u](.*)",
+        r'[S-Us-u](.*)',
         (
             "What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.",
             "Hence his victories bring him neither reputation for wisdom nor credit for courage.",
@@ -107,7 +108,7 @@ pairs = (
         ),
     ),
     (
-        r"[V-Zv-z](.*)",
+        r'[V-Zv-z](.*)',
         (
             "It is a matter of life and death, a road either to safety or to ruin.",
             "Hold out baits to entice the enemy. Feign disorder, and crush him.",
@@ -117,7 +118,7 @@ pairs = (
             "Just as water retains no constant shape, so in warfare there are no constant conditions.",
         ),
     ),
-    (r"(.*)", ("Your statement insults me.", "")),
+    (r'(.*)', ("Your statement insults me.", "")),
 )
 
 suntsu_chatbot = Chat(pairs, reflections)
@@ -126,7 +127,7 @@ suntsu_chatbot = Chat(pairs, reflections)
 def suntsu_chat():
     print("Talk to the program by typing in plain English, using normal upper-")
     print('and lower-case letters and punctuation.  Enter "quit" when done.')
-    print("=" * 72)
+    print('=' * 72)
     print("You seek enlightenment?")
 
     suntsu_chatbot.converse()
index 8f4ec5d..f2dd361 100644 (file)
@@ -1,16 +1,19 @@
 # Natural Language Toolkit: Chatbot Utilities
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 # Based on an Eliza implementation by Joe Strout <joe@strout.net>,
 # Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
+from __future__ import print_function
 
 import re
 import random
 
+from six.moves import input
+
 
 reflections = {
     "i am": "you are",
@@ -54,7 +57,7 @@ class Chat(object):
         self._regex = self._compile_reflections()
 
     def _compile_reflections(self):
-        sorted_refl = sorted(self._reflections, key=len, reverse=True)
+        sorted_refl = sorted(self._reflections.keys(), key=len, reverse=True)
         return re.compile(
             r"\b({0})\b".format("|".join(map(re.escape, sorted_refl))), re.IGNORECASE
         )
@@ -74,7 +77,7 @@ class Chat(object):
         )
 
     def _wildcards(self, response, match):
-        pos = response.find("%")
+        pos = response.find('%')
         while pos >= 0:
             num = int(response[pos + 1 : pos + 2])
             response = (
@@ -82,7 +85,7 @@ class Chat(object):
                 + self._substitute(match.group(num))
                 + response[pos + 2 :]
             )
-            pos = response.find("%")
+            pos = response.find('%')
         return response
 
     def respond(self, str):
@@ -104,10 +107,10 @@ class Chat(object):
                 resp = self._wildcards(resp, match)  # process wildcards
 
                 # fix munged punctuation at the end
-                if resp[-2:] == "?.":
-                    resp = resp[:-2] + "."
-                if resp[-2:] == "??":
-                    resp = resp[:-2] + "?"
+                if resp[-2:] == '?.':
+                    resp = resp[:-2] + '.'
+                if resp[-2:] == '??':
+                    resp = resp[:-2] + '?'
                 return resp
 
     # Hold a conversation with a chatbot
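For context, the reflection and wildcard machinery changed above behaves roughly as follows; the pair and the input sentence are made up for illustration:

    from nltk.chat.util import Chat, reflections

    # One (pattern, responses) pair.  '%1' is filled from the first regex group,
    # with pronouns flipped through the reflections table ("my" -> "your", ...).
    pairs = ((r'I need (.*)', ("Why do you need %1?",)),)
    bot = Chat(pairs, reflections)

    print(bot.respond("I need my code reviewed"))
    # prints something like: Why do you need your code reviewed?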
index 2ae944b..d46a9f9 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Zen Chatbot
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Amy Holland <amyrh@csse.unimelb.edu.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -35,6 +35,7 @@ responses are very limited. Zen Chatbot will usually answer very vaguely, or
 respond to a question by asking a different question, in much the same way
 as Eliza.
 """
+from __future__ import print_function
 
 from nltk.chat.util import Chat, reflections
 
@@ -46,7 +47,7 @@ responses = (
     # "good day" etc, but also "good grief!"  and other sentences starting
     # with the word 'good' that may not be a greeting
     (
-        r"(hello(.*))|(good [a-zA-Z]+)",
+        r'(hello(.*))|(good [a-zA-Z]+)',
         (
             "The path to enlightenment is often difficult to see.",
             "Greetings. I sense your mind is troubled. Tell me of your troubles.",
@@ -64,7 +65,7 @@ responses = (
     # interpretation only makes sense for some inputs
     #
     (
-        r"i need (.*)",
+        r'i need (.*)',
         (
             "%1 can be achieved by hard work and dedication of the mind.",
             "%1 is not a need, but a desire of the mind. Clear your mind of such concerns.",
@@ -72,7 +73,7 @@ responses = (
         ),
     ),
     (
-        r"i want (.*)",
+        r'i want (.*)',
         (
             "Desires of the heart will distract you from the path to enlightenment.",
             "Will%1 help you attain enlightenment?",
@@ -88,27 +89,27 @@ responses = (
     #     chatbot: "Are you sure I tell you?"
     # - this style works for positives (e.g. "why do you like cake?")
     #   but does not work for negatives (e.g. "why don't you like cake?")
-    (r"why (.*) i (.*)\?", ("You%1%2?", "Perhaps you only think you%1%2")),
-    (r"why (.*) you(.*)\?", ("Why%1 you%2?", "%2 I%1", "Are you sure I%2?")),
-    (r"why (.*)\?", ("I cannot tell you why%1.", "Why do you think %1?")),
+    (r'why (.*) i (.*)\?', ("You%1%2?", "Perhaps you only think you%1%2")),
+    (r'why (.*) you(.*)\?', ("Why%1 you%2?", "%2 I%1", "Are you sure I%2?")),
+    (r'why (.*)\?', ("I cannot tell you why%1.", "Why do you think %1?")),
     # e.g. "are you listening?", "are you a duck"
     (
-        r"are you (.*)\?",
+        r'are you (.*)\?',
         ("Maybe%1, maybe not%1.", "Whether I am%1 or not is God's business."),
     ),
     # e.g. "am I a duck?", "am I going to die?"
     (
-        r"am i (.*)\?",
+        r'am i (.*)\?',
         ("Perhaps%1, perhaps not%1.", "Whether you are%1 or not is not for me to say."),
     ),
     # what questions, e.g. "what time is it?"
     # problems:
     #     person:  "What do you want?"
     #    chatbot: "Seek truth, not what do me want."
-    (r"what (.*)\?", ("Seek truth, not what%1.", "What%1 should not concern you.")),
+    (r'what (.*)\?', ("Seek truth, not what%1.", "What%1 should not concern you.")),
     # how questions, e.g. "how do you do?"
     (
-        r"how (.*)\?",
+        r'how (.*)\?',
         (
             "How do you suppose?",
             "Will an answer to that really help in your search for enlightenment?",
@@ -117,7 +118,7 @@ responses = (
     ),
     # can questions, e.g. "can you run?", "can you come over here please?"
     (
-        r"can you (.*)\?",
+        r'can you (.*)\?',
         (
             "I probably can, but I may not.",
             "Maybe I can%1, and maybe I cannot.",
@@ -126,7 +127,7 @@ responses = (
     ),
     # can questions, e.g. "can I have some cake?", "can I know truth?"
     (
-        r"can i (.*)\?",
+        r'can i (.*)\?',
         (
             "You can%1 if you believe you can%1, and have a pure spirit.",
             "Seek truth and you will know if you can%1.",
@@ -134,7 +135,7 @@ responses = (
     ),
     # e.g. "It is raining" - implies the speaker is certain of a fact
     (
-        r"it is (.*)",
+        r'it is (.*)',
         (
             "How can you be certain that%1, when you do not even know yourself?",
             "Whether it is%1 or not does not change the way the world is.",
@@ -142,14 +143,14 @@ responses = (
     ),
     # e.g. "is there a doctor in the house?"
     (
-        r"is there (.*)\?",
+        r'is there (.*)\?',
         ("There is%1 if you believe there is.", "It is possible that there is%1."),
     ),
     # e.g. "is it possible?", "is this true?"
-    (r"is(.*)\?", ("%1 is not relevant.", "Does this matter?")),
+    (r'is(.*)\?', ("%1 is not relevant.", "Does this matter?")),
     # non-specific question
     (
-        r"(.*)\?",
+        r'(.*)\?',
         (
             "Do you think %1?",
             "You seek the truth. Does the truth seek you?",
@@ -159,7 +160,7 @@ responses = (
     ),
     # expression of hate of form "I hate you" or "Kelly hates cheese"
     (
-        r"(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)",
+        r'(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)',
         (
             "Perhaps it is not about hating %2, but about hate from within.",
             "Weeds only grow when we dislike them",
@@ -168,7 +169,7 @@ responses = (
     ),
     # statement containing the word 'truth'
     (
-        r"(.*) truth(.*)",
+        r'(.*) truth(.*)',
         (
             "Seek truth, and truth will seek you.",
             "Remember, it is not the spoon which bends - only yourself.",
@@ -178,13 +179,13 @@ responses = (
     # desire to do an action
     # e.g. "I want to go shopping"
     (
-        r"i want to (.*)",
+        r'i want to (.*)',
         ("You may %1 if your heart truly desires to.", "You may have to %1."),
     ),
     # desire for an object
     # e.g. "I want a pony"
     (
-        r"i want (.*)",
+        r'i want (.*)',
         (
             "Does your heart truly desire %1?",
             "Is this a desire of the heart, or of the mind?",
@@ -192,7 +193,7 @@ responses = (
     ),
     # e.g. "I can't wait" or "I can't do this"
     (
-        r"i can\'t (.*)",
+        r'i can\'t (.*)',
         (
             "What we can and can't do is a limitation of the mind.",
             "There are limitations of the body, and limitations of the mind.",
@@ -203,7 +204,7 @@ responses = (
     # problem: exceptions...
     # e.g. "I think, therefore I am"
     (
-        r"i think (.*)",
+        r'i think (.*)',
         (
             "Uncertainty in an uncertain world.",
             "Indeed, how can we be certain of anything in such uncertain times.",
@@ -212,7 +213,7 @@ responses = (
     ),
     # "I feel...emotions/sick/light-headed..."
     (
-        r"i feel (.*)",
+        r'i feel (.*)',
         (
             "Your body and your emotions are both symptoms of your mind."
             "What do you believe is the root of such feelings?",
@@ -222,7 +223,7 @@ responses = (
     # exclaimation mark indicating emotion
     # e.g. "Wow!" or "No!"
     (
-        r"(.*)!",
+        r'(.*)!',
         (
             "I sense that you are feeling emotional today.",
             "You need to calm your emotions.",
@@ -231,7 +232,7 @@ responses = (
     # because [statement]
     # e.g. "because I said so"
     (
-        r"because (.*)",
+        r'because (.*)',
         (
             "Does knowning the reasons behind things help you to understand"
             " the things themselves?",
@@ -240,7 +241,7 @@ responses = (
     ),
     # yes or no - raise an issue of certainty/correctness
     (
-        r"(yes)|(no)",
+        r'(yes)|(no)',
         (
             "Is there certainty in an uncertain world?",
             "It is better to be right than to be certain.",
@@ -248,7 +249,7 @@ responses = (
     ),
     # sentence containing word 'love'
     (
-        r"(.*)love(.*)",
+        r'(.*)love(.*)',
         (
             "Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. Let your heart be like the trees.",
             "Free love!",
@@ -256,7 +257,7 @@ responses = (
     ),
     # sentence containing word 'understand' - r
     (
-        r"(.*)understand(.*)",
+        r'(.*)understand(.*)',
         (
             "If you understand, things are just as they are;"
             " if you do not understand, things are just as they are.",
@@ -266,7 +267,7 @@ responses = (
     # 'I', 'me', 'my' - person is talking about themself.
     # this breaks down when words contain these - eg 'Thyme', 'Irish'
     (
-        r"(.*)(me )|( me)|(my)|(mine)|(i)(.*)",
+        r'(.*)(me )|( me)|(my)|(mine)|(i)(.*)',
         (
             "'I', 'me', 'my'... these are selfish expressions.",
             "Have you ever considered that you might be a selfish person?",
@@ -277,12 +278,12 @@ responses = (
     # 'you' starting a sentence
     # e.g. "you stink!"
     (
-        r"you (.*)",
+        r'you (.*)',
         ("My path is not of conern to you.", "I am but one, and you but one more."),
     ),
     # say goodbye with some extra Zen wisdom.
     (
-        r"exit",
+        r'exit',
         (
             "Farewell. The obstacle is the path.",
             "Farewell. Life is a journey, not a destination.",
@@ -294,7 +295,7 @@ responses = (
     # when stumped, respond with generic zen wisdom
     #
     (
-        r"(.*)",
+        r'(.*)',
         (
             "When you're enlightened, every word is wisdom.",
             "Random talk is useless.",
@@ -309,13 +310,13 @@ zen_chatbot = Chat(responses, reflections)
 
 
 def zen_chat():
-    print("*" * 75)
+    print('*' * 75)
     print("Zen Chatbot!".center(75))
-    print("*" * 75)
+    print('*' * 75)
     print('"Look beyond mere words and letters - look into your mind"'.center(75))
     print("* Talk your way to truth with Zen Chatbot.")
     print("* Type 'quit' when you have had enough.")
-    print("*" * 75)
+    print('*' * 75)
     print("Welcome, my child.")
 
     zen_chatbot.converse()
index 3ec1e19..f4b107c 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Chunkers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -169,8 +169,8 @@ from nltk.chunk.util import (
 from nltk.chunk.regexp import RegexpChunkParser, RegexpParser
 
 # Standard treebank POS tagger
-_BINARY_NE_CHUNKER = "chunkers/maxent_ne_chunker/english_ace_binary.pickle"
-_MULTICLASS_NE_CHUNKER = "chunkers/maxent_ne_chunker/english_ace_multiclass.pickle"
+_BINARY_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_binary.pickle'
+_MULTICLASS_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'
 
 
 def ne_chunk(tagged_tokens, binary=False):
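The two pickled models referenced above back nltk.ne_chunk(); a typical call looks like the sketch below. The sentence is invented, and the tokenizer, tagger and chunker data packages must already be installed via nltk.download():

    from nltk import word_tokenize, pos_tag, ne_chunk

    tagged = pos_tag(word_tokenize("Steven Bird works on NLTK in Melbourne."))
    print(ne_chunk(tagged))               # multiclass labels such as PERSON, GPE
    print(ne_chunk(tagged, binary=True))  # binary=True collapses all types to NE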
index 9e922d0..a88b3e6 100644 (file)
Binary files a/nlp_resource_data/nltk/chunk/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/chunk/__pycache__/__init__.cpython-37.pyc differ
index f9acf6d..c7a8346 100644 (file)
Binary files a/nlp_resource_data/nltk/chunk/__pycache__/api.cpython-37.pyc and b/nlp_resource_data/nltk/chunk/__pycache__/api.cpython-37.pyc differ
index fa26906..5d5dbf2 100644 (file)
Binary files a/nlp_resource_data/nltk/chunk/__pycache__/named_entity.cpython-37.pyc and b/nlp_resource_data/nltk/chunk/__pycache__/named_entity.cpython-37.pyc differ
index 48374fb..b4077df 100644 (file)
Binary files a/nlp_resource_data/nltk/chunk/__pycache__/regexp.cpython-37.pyc and b/nlp_resource_data/nltk/chunk/__pycache__/regexp.cpython-37.pyc differ
index f9a8212..2d7dfcc 100644 (file)
Binary files a/nlp_resource_data/nltk/chunk/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/chunk/__pycache__/util.cpython-37.pyc differ
index 1e4af77..1454825 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Chunk parsing API
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com> (minor additions)
 # URL: <http://nltk.org/>
index 863ee99..07d3067 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Chunk parsing API
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -8,6 +8,8 @@
 """
 Named entity chunker
 """
+from __future__ import print_function
+from __future__ import unicode_literals
 
 import os, re, pickle
 from xml.etree import ElementTree as ET
@@ -39,7 +41,7 @@ class NEChunkParserTagger(ClassifierBasedTagger):
 
     def _classifier_builder(self, train):
         return MaxentClassifier.train(
-            train, algorithm="megam", gaussian_prior_sigma=1, trace=2
+            train, algorithm='megam', gaussian_prior_sigma=1, trace=2
         )
 
     def _english_wordlist(self):
@@ -48,7 +50,7 @@ class NEChunkParserTagger(ClassifierBasedTagger):
         except AttributeError:
             from nltk.corpus import words
 
-            self._en_wordlist = set(words.words("en-basic"))
+            self._en_wordlist = set(words.words('en-basic'))
             wl = self._en_wordlist
         return wl
 
@@ -90,22 +92,22 @@ class NEChunkParserTagger(ClassifierBasedTagger):
 
         # 89.6
         features = {
-            "bias": True,
-            "shape": shape(word),
-            "wordlen": len(word),
-            "prefix3": word[:3].lower(),
-            "suffix3": word[-3:].lower(),
-            "pos": pos,
-            "word": word,
-            "en-wordlist": (word in self._english_wordlist()),
-            "prevtag": prevtag,
-            "prevpos": prevpos,
-            "nextpos": nextpos,
-            "prevword": prevword,
-            "nextword": nextword,
-            "word+nextpos": "{0}+{1}".format(word.lower(), nextpos),
-            "pos+prevtag": "{0}+{1}".format(pos, prevtag),
-            "shape+prevtag": "{0}+{1}".format(prevshape, prevtag),
+            'bias': True,
+            'shape': shape(word),
+            'wordlen': len(word),
+            'prefix3': word[:3].lower(),
+            'suffix3': word[-3:].lower(),
+            'pos': pos,
+            'word': word,
+            'en-wordlist': (word in self._english_wordlist()),
+            'prevtag': prevtag,
+            'prevpos': prevpos,
+            'nextpos': nextpos,
+            'prevword': prevword,
+            'nextword': nextword,
+            'word+nextpos': '{0}+{1}'.format(word.lower(), nextpos),
+            'pos+prevtag': '{0}+{1}'.format(pos, prevtag),
+            'shape+prevtag': '{0}+{1}'.format(prevshape, prevtag),
         }
 
         return features
@@ -137,14 +139,14 @@ class NEChunkParser(ChunkParserI):
         """
         Convert a list of tagged tokens to a chunk-parse tree.
         """
-        sent = Tree("S", [])
+        sent = Tree('S', [])
 
         for (tok, tag) in tagged_tokens:
-            if tag == "O":
+            if tag == 'O':
                 sent.append(tok)
-            elif tag.startswith("B-"):
+            elif tag.startswith('B-'):
                 sent.append(Tree(tag[2:], [tok]))
-            elif tag.startswith("I-"):
+            elif tag.startswith('I-'):
                 if sent and isinstance(sent[-1], Tree) and sent[-1].label() == tag[2:]:
                     sent[-1].append(tok)
                 else:
@@ -162,42 +164,42 @@ class NEChunkParser(ChunkParserI):
                 if len(child) == 0:
                     print("Warning -- empty chunk in sentence")
                     continue
-                toks.append((child[0], "B-{0}".format(child.label())))
+                toks.append((child[0], 'B-{0}'.format(child.label())))
                 for tok in child[1:]:
-                    toks.append((tok, "I-{0}".format(child.label())))
+                    toks.append((tok, 'I-{0}'.format(child.label())))
             else:
-                toks.append((child, "O"))
+                toks.append((child, 'O'))
         return toks
 
 
 def shape(word):
-    if re.match("[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE):
-        return "number"
-    elif re.match("\W+$", word, re.UNICODE):
-        return "punct"
-    elif re.match("\w+$", word, re.UNICODE):
+    if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word, re.UNICODE):
+        return 'number'
+    elif re.match('\W+$', word, re.UNICODE):
+        return 'punct'
+    elif re.match('\w+$', word, re.UNICODE):
         if word.istitle():
-            return "upcase"
+            return 'upcase'
         elif word.islower():
-            return "downcase"
+            return 'downcase'
         else:
-            return "mixedcase"
+            return 'mixedcase'
     else:
-        return "other"
+        return 'other'
 
 
 def simplify_pos(s):
-    if s.startswith("V"):
+    if s.startswith('V'):
         return "V"
     else:
-        return s.split("-")[0]
+        return s.split('-')[0]
 
 
 def postag_tree(tree):
     # Part-of-speech tagging.
     words = tree.leaves()
     tag_iter = (pos for (word, pos) in pos_tag(words))
-    newtree = Tree("S", [])
+    newtree = Tree('S', [])
     for child in tree:
         if isinstance(child, Tree):
             newtree.append(Tree(child.label(), []))
@@ -208,47 +210,47 @@ def postag_tree(tree):
     return newtree
 
 
-def load_ace_data(roots, fmt="binary", skip_bnews=True):
+def load_ace_data(roots, fmt='binary', skip_bnews=True):
     for root in roots:
         for root, dirs, files in os.walk(root):
-            if root.endswith("bnews") and skip_bnews:
+            if root.endswith('bnews') and skip_bnews:
                 continue
             for f in files:
-                if f.endswith(".sgm"):
+                if f.endswith('.sgm'):
                     for sent in load_ace_file(os.path.join(root, f), fmt):
                         yield sent
 
 
 def load_ace_file(textfile, fmt):
-    print("  - {0}".format(os.path.split(textfile)[1]))
-    annfile = textfile + ".tmx.rdc.xml"
+    print('  - {0}'.format(os.path.split(textfile)[1]))
+    annfile = textfile + '.tmx.rdc.xml'
 
     # Read the xml file, and get a list of entities
     entities = []
-    with open(annfile, "r") as infile:
+    with open(annfile, 'r') as infile:
         xml = ET.parse(infile).getroot()
-    for entity in xml.findall("document/entity"):
-        typ = entity.find("entity_type").text
-        for mention in entity.findall("entity_mention"):
-            if mention.get("TYPE") != "NAME":
+    for entity in xml.findall('document/entity'):
+        typ = entity.find('entity_type').text
+        for mention in entity.findall('entity_mention'):
+            if mention.get('TYPE') != 'NAME':
                 continue  # only NEs
-            s = int(mention.find("head/charseq/start").text)
-            e = int(mention.find("head/charseq/end").text) + 1
+            s = int(mention.find('head/charseq/start').text)
+            e = int(mention.find('head/charseq/end').text) + 1
             entities.append((s, e, typ))
 
     # Read the text file, and mark the entities.
-    with open(textfile, "r") as infile:
+    with open(textfile, 'r') as infile:
         text = infile.read()
 
     # Strip XML tags, since they don't count towards the indices
-    text = re.sub("<(?!/?TEXT)[^>]+>", "", text)
+    text = re.sub('<(?!/?TEXT)[^>]+>', '', text)
 
     # Blank out anything before/after <TEXT>
     def subfunc(m):
-        return " " * (m.end() - m.start() - 6)
+        return ' ' * (m.end() - m.start() - 6)
 
-    text = re.sub("[\s\S]*<TEXT>", subfunc, text)
-    text = re.sub("</TEXT>[\s\S]*", "", text)
+    text = re.sub('[\s\S]*<TEXT>', subfunc, text)
+    text = re.sub('</TEXT>[\s\S]*', '', text)
 
     # Simplify quotes
     text = re.sub("``", ' "', text)
@@ -257,24 +259,24 @@ def load_ace_file(textfile, fmt):
     entity_types = set(typ for (s, e, typ) in entities)
 
     # Binary distinction (NE or not NE)
-    if fmt == "binary":
+    if fmt == 'binary':
         i = 0
-        toks = Tree("S", [])
+        toks = Tree('S', [])
         for (s, e, typ) in sorted(entities):
             if s < i:
                 s = i  # Overlapping!  Deal with this better?
             if e <= s:
                 continue
             toks.extend(word_tokenize(text[i:s]))
-            toks.append(Tree("NE", text[s:e].split()))
+            toks.append(Tree('NE', text[s:e].split()))
             i = e
         toks.extend(word_tokenize(text[i:]))
         yield toks
 
     # Multiclass distinction (NE type)
-    elif fmt == "multiclass":
+    elif fmt == 'multiclass':
         i = 0
-        toks = Tree("S", [])
+        toks = Tree('S', [])
         for (s, e, typ) in sorted(entities):
             if s < i:
                 s = i  # Overlapping!  Deal with this better?
@@ -287,7 +289,7 @@ def load_ace_file(textfile, fmt):
         yield toks
 
     else:
-        raise ValueError("bad fmt value")
+        raise ValueError('bad fmt value')
 
 
 # This probably belongs in a more general-purpose location (as does
@@ -297,36 +299,36 @@ def cmp_chunks(correct, guessed):
     guessed = NEChunkParser._parse_to_tagged(guessed)
     ellipsis = False
     for (w, ct), (w, gt) in zip(correct, guessed):
-        if ct == gt == "O":
+        if ct == gt == 'O':
             if not ellipsis:
                 print("  {:15} {:15} {2}".format(ct, gt, w))
-                print("  {:15} {:15} {2}".format("...", "...", "..."))
+                print('  {:15} {:15} {2}'.format('...', '...', '...'))
                 ellipsis = True
         else:
             ellipsis = False
             print("  {:15} {:15} {2}".format(ct, gt, w))
 
 
-def build_model(fmt="binary"):
-    print("Loading training data...")
+def build_model(fmt='binary'):
+    print('Loading training data...')
     train_paths = [
-        find("corpora/ace_data/ace.dev"),
-        find("corpora/ace_data/ace.heldout"),
-        find("corpora/ace_data/bbn.dev"),
-        find("corpora/ace_data/muc.dev"),
+        find('corpora/ace_data/ace.dev'),
+        find('corpora/ace_data/ace.heldout'),
+        find('corpora/ace_data/bbn.dev'),
+        find('corpora/ace_data/muc.dev'),
     ]
     train_trees = load_ace_data(train_paths, fmt)
     train_data = [postag_tree(t) for t in train_trees]
-    print("Training...")
+    print('Training...')
     cp = NEChunkParser(train_data)
     del train_data
 
-    print("Loading eval data...")
-    eval_paths = [find("corpora/ace_data/ace.eval")]
+    print('Loading eval data...')
+    eval_paths = [find('corpora/ace_data/ace.eval')]
     eval_trees = load_ace_data(eval_paths, fmt)
     eval_data = [postag_tree(t) for t in eval_trees]
 
-    print("Evaluating...")
+    print('Evaluating...')
     chunkscore = ChunkScore()
     for i, correct in enumerate(eval_data):
         guess = cp.parse(correct.leaves())
@@ -335,18 +337,18 @@ def build_model(fmt="binary"):
             cmp_chunks(correct, guess)
     print(chunkscore)
 
-    outfilename = "/tmp/ne_chunker_{0}.pickle".format(fmt)
-    print("Saving chunker to {0}...".format(outfilename))
+    outfilename = '/tmp/ne_chunker_{0}.pickle'.format(fmt)
+    print('Saving chunker to {0}...'.format(outfilename))
 
-    with open(outfilename, "wb") as outfile:
+    with open(outfilename, 'wb') as outfile:
         pickle.dump(cp, outfile, -1)
 
     return cp
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     # Make sure that the pickled object has the right class name:
     from nltk.chunk.named_entity import build_model
 
-    build_model("binary")
-    build_model("multiclass")
+    build_model('binary')
+    build_model('multiclass')
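Among the helpers touched in this file, shape() buckets tokens into coarse orthographic classes that feed the classifier features above; a small sketch of its behaviour:

    from nltk.chunk.named_entity import shape

    print(shape("1984"))       # 'number'
    print(shape("..."))        # 'punct'
    print(shape("Melbourne"))  # 'upcase'
    print(shape("nltk"))       # 'downcase'
    print(shape("NLTk"))       # 'mixedcase'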
index f0e910c..fe4ab5b 100644 (file)
@@ -1,21 +1,27 @@
 # Natural Language Toolkit: Regular Expression Chunkers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com> (minor additions)
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+from __future__ import division
 
 import re
 
+from six import string_types
+
 from nltk.tree import Tree
 from nltk.chunk.api import ChunkParserI
+from nltk.compat import python_2_unicode_compatible, unicode_repr
 
 ##//////////////////////////////////////////////////////
 ##  ChunkString
 ##//////////////////////////////////////////////////////
 
 
+@python_2_unicode_compatible
 class ChunkString(object):
     """
     A string-based encoding of a particular chunking of a text.
@@ -54,18 +60,18 @@ class ChunkString(object):
         will only match positions that are in chinks.
     """
 
-    CHUNK_TAG_CHAR = r"[^\{\}<>]"
-    CHUNK_TAG = r"(<%s+?>)" % CHUNK_TAG_CHAR
+    CHUNK_TAG_CHAR = r'[^\{\}<>]'
+    CHUNK_TAG = r'(<%s+?>)' % CHUNK_TAG_CHAR
 
-    IN_CHUNK_PATTERN = r"(?=[^\{]*\})"
-    IN_CHINK_PATTERN = r"(?=[^\}]*(\{|$))"
+    IN_CHUNK_PATTERN = r'(?=[^\{]*\})'
+    IN_CHINK_PATTERN = r'(?=[^\}]*(\{|$))'
 
     # These are used by _verify
-    _CHUNK = r"(\{%s+?\})+?" % CHUNK_TAG
-    _CHINK = r"(%s+?)+?" % CHUNK_TAG
-    _VALID = re.compile(r"^(\{?%s\}?)*?$" % CHUNK_TAG)
-    _BRACKETS = re.compile("[^\{\}]+")
-    _BALANCED_BRACKETS = re.compile(r"(\{\})*$")
+    _CHUNK = r'(\{%s+?\})+?' % CHUNK_TAG
+    _CHINK = r'(%s+?)+?' % CHUNK_TAG
+    _VALID = re.compile(r'^(\{?%s\}?)*?$' % CHUNK_TAG)
+    _BRACKETS = re.compile('[^\{\}]+')
+    _BALANCED_BRACKETS = re.compile(r'(\{\})*$')
 
     def __init__(self, chunk_struct, debug_level=1):
         """
@@ -91,7 +97,7 @@ class ChunkString(object):
         self._root_label = chunk_struct.label()
         self._pieces = chunk_struct[:]
         tags = [self._tag(tok) for tok in self._pieces]
-        self._str = "<" + "><".join(tags) + ">"
+        self._str = '<' + '><'.join(tags) + '>'
         self._debug = debug_level
 
     def _tag(self, tok):
@@ -100,7 +106,7 @@ class ChunkString(object):
         elif isinstance(tok, Tree):
             return tok.label()
         else:
-            raise ValueError("chunk structures must contain tagged " "tokens or trees")
+            raise ValueError('chunk structures must contain tagged ' 'tokens or trees')
 
     def _verify(self, s, verify_tags):
         """
@@ -121,31 +127,31 @@ class ChunkString(object):
         # Check overall form
         if not ChunkString._VALID.match(s):
             raise ValueError(
-                "Transformation generated invalid " "chunkstring:\n  %s" % s
+                'Transformation generated invalid ' 'chunkstring:\n  %s' % s
             )
 
         # Check that parens are balanced.  If the string is long, we
         # have to do this in pieces, to avoid a maximum recursion
         # depth limit for regular expressions.
-        brackets = ChunkString._BRACKETS.sub("", s)
+        brackets = ChunkString._BRACKETS.sub('', s)
         for i in range(1 + len(brackets) // 5000):
             substr = brackets[i * 5000 : i * 5000 + 5000]
             if not ChunkString._BALANCED_BRACKETS.match(substr):
                 raise ValueError(
-                    "Transformation generated invalid " "chunkstring:\n  %s" % s
+                    'Transformation generated invalid ' 'chunkstring:\n  %s' % s
                 )
 
         if verify_tags <= 0:
             return
 
-        tags1 = (re.split(r"[\{\}<>]+", s))[1:-1]
+        tags1 = (re.split(r'[\{\}<>]+', s))[1:-1]
         tags2 = [self._tag(piece) for piece in self._pieces]
         if tags1 != tags2:
             raise ValueError(
-                "Transformation generated invalid " "chunkstring: tag changed"
+                'Transformation generated invalid ' 'chunkstring: tag changed'
             )
 
-    def to_chunkstruct(self, chunk_label="CHUNK"):
+    def to_chunkstruct(self, chunk_label='CHUNK'):
         """
         Return the chunk structure encoded by this ``ChunkString``.
 
@@ -160,10 +166,10 @@ class ChunkString(object):
         pieces = []
         index = 0
         piece_in_chunk = 0
-        for piece in re.split("[{}]", self._str):
+        for piece in re.split('[{}]', self._str):
 
             # Find the list of tokens contained in this piece.
-            length = piece.count("<")
+            length = piece.count('<')
             subsequence = self._pieces[index : index + length]
 
             # Add this list of tokens to our pieces.
@@ -209,7 +215,7 @@ class ChunkString(object):
         # The substitution might have generated "empty chunks"
         # (substrings of the form "{}").  Remove them, so they don't
         # interfere with other transformations.
-        s = re.sub("\{\}", "", s)
+        s = re.sub('\{\}', '', s)
 
         # Make sure that the transformation was legal.
         if self._debug > 1:
@@ -227,7 +233,7 @@ class ChunkString(object):
 
         :rtype: str
         """
-        return "<ChunkString: %s>" % repr(self._str)
+        return '<ChunkString: %s>' % unicode_repr(self._str)
 
     def __str__(self):
         """
@@ -239,10 +245,10 @@ class ChunkString(object):
        :rtype: str
         """
         # Add spaces to make everything line up.
-        str = re.sub(r">(?!\})", r"> ", self._str)
-        str = re.sub(r"([^\{])<", r"\1 <", str)
-        if str[0] == "<":
-            str = " " + str
+        str = re.sub(r'>(?!\})', r'> ', self._str)
+        str = re.sub(r'([^\{])<', r'\1 <', str)
+        if str[0] == '<':
+            str = ' ' + str
         return str
 
 
@@ -251,6 +257,7 @@ class ChunkString(object):
 ##//////////////////////////////////////////////////////
 
 
+@python_2_unicode_compatible
 class RegexpChunkRule(object):
     """
     A rule specifying how to modify the chunking in a ``ChunkString``,
@@ -295,7 +302,7 @@ class RegexpChunkRule(object):
         :param descr: A short description of the purpose and/or effect
             of this rule.
         """
-        if isinstance(regexp, str):
+        if isinstance(regexp, string_types):
             regexp = re.compile(regexp)
         self._repl = repl
         self._descr = descr
@@ -338,11 +345,11 @@ class RegexpChunkRule(object):
         :rtype: str
         """
         return (
-            "<RegexpChunkRule: "
-            + repr(self._regexp.pattern)
-            + "->"
-            + repr(self._repl)
-            + ">"
+            '<RegexpChunkRule: '
+            + unicode_repr(self._regexp.pattern)
+            + '->'
+            + unicode_repr(self._repl)
+            + '>'
         )
 
     @staticmethod
@@ -365,33 +372,34 @@ class RegexpChunkRule(object):
         <ChunkRule: '<DT>?<NN.*>+'>
         """
         # Split off the comment (but don't split on '\#')
-        m = re.match(r"(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?", s)
-        rule = m.group("rule").strip()
-        comment = (m.group("comment") or "")[1:].strip()
+        m = re.match(r'(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?', s)
+        rule = m.group('rule').strip()
+        comment = (m.group('comment') or '')[1:].strip()
 
         # Pattern bodies: chunk, chink, split, merge
         try:
             if not rule:
-                raise ValueError("Empty chunk pattern")
-            if rule[0] == "{" and rule[-1] == "}":
+                raise ValueError('Empty chunk pattern')
+            if rule[0] == '{' and rule[-1] == '}':
                 return ChunkRule(rule[1:-1], comment)
-            elif rule[0] == "}" and rule[-1] == "{":
+            elif rule[0] == '}' and rule[-1] == '{':
                 return ChinkRule(rule[1:-1], comment)
-            elif "}{" in rule:
-                left, right = rule.split("}{")
+            elif '}{' in rule:
+                left, right = rule.split('}{')
                 return SplitRule(left, right, comment)
-            elif "{}" in rule:
-                left, right = rule.split("{}")
+            elif '{}' in rule:
+                left, right = rule.split('{}')
                 return MergeRule(left, right, comment)
-            elif re.match("[^{}]*{[^{}]*}[^{}]*", rule):
-                left, chunk, right = re.split("[{}]", rule)
+            elif re.match('[^{}]*{[^{}]*}[^{}]*', rule):
+                left, chunk, right = re.split('[{}]', rule)
                 return ChunkRuleWithContext(left, chunk, right, comment)
             else:
-                raise ValueError("Illegal chunk pattern: %s" % rule)
+                raise ValueError('Illegal chunk pattern: %s' % rule)
         except (ValueError, re.error):
-            raise ValueError("Illegal chunk pattern: %s" % rule)
+            raise ValueError('Illegal chunk pattern: %s' % rule)
 
 
+@python_2_unicode_compatible
 class ChunkRule(RegexpChunkRule):
     """
     A rule specifying how to add chunks to a ``ChunkString``, using a
@@ -417,10 +425,10 @@ class ChunkRule(RegexpChunkRule):
         """
         self._pattern = tag_pattern
         regexp = re.compile(
-            "(?P<chunk>%s)%s"
+            '(?P<chunk>%s)%s'
             % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHINK_PATTERN)
         )
-        RegexpChunkRule.__init__(self, regexp, "{\g<chunk>}", descr)
+        RegexpChunkRule.__init__(self, regexp, '{\g<chunk>}', descr)
 
     def __repr__(self):
         """
@@ -434,9 +442,10 @@ class ChunkRule(RegexpChunkRule):
 
         :rtype: str
         """
-        return "<ChunkRule: " + repr(self._pattern) + ">"
+        return '<ChunkRule: ' + unicode_repr(self._pattern) + '>'
 
 
+@python_2_unicode_compatible
 class ChinkRule(RegexpChunkRule):
     """
     A rule specifying how to remove chinks from a ``ChunkString``,
@@ -462,10 +471,10 @@ class ChinkRule(RegexpChunkRule):
         """
         self._pattern = tag_pattern
         regexp = re.compile(
-            "(?P<chink>%s)%s"
+            '(?P<chink>%s)%s'
             % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHUNK_PATTERN)
         )
-        RegexpChunkRule.__init__(self, regexp, "}\g<chink>{", descr)
+        RegexpChunkRule.__init__(self, regexp, '}\g<chink>{', descr)
 
     def __repr__(self):
         """
@@ -479,9 +488,10 @@ class ChinkRule(RegexpChunkRule):
 
         :rtype: str
         """
-        return "<ChinkRule: " + repr(self._pattern) + ">"
+        return '<ChinkRule: ' + unicode_repr(self._pattern) + '>'
 
 
+@python_2_unicode_compatible
 class UnChunkRule(RegexpChunkRule):
     """
     A rule specifying how to remove chunks from a ``ChunkString``,
@@ -504,8 +514,8 @@ class UnChunkRule(RegexpChunkRule):
             of this rule.
         """
         self._pattern = tag_pattern
-        regexp = re.compile("\{(?P<chunk>%s)\}" % tag_pattern2re_pattern(tag_pattern))
-        RegexpChunkRule.__init__(self, regexp, "\g<chunk>", descr)
+        regexp = re.compile('\{(?P<chunk>%s)\}' % tag_pattern2re_pattern(tag_pattern))
+        RegexpChunkRule.__init__(self, regexp, '\g<chunk>', descr)
 
     def __repr__(self):
         """
@@ -519,9 +529,10 @@ class UnChunkRule(RegexpChunkRule):
 
         :rtype: str
         """
-        return "<UnChunkRule: " + repr(self._pattern) + ">"
+        return '<UnChunkRule: ' + unicode_repr(self._pattern) + '>'
 
 
+@python_2_unicode_compatible
 class MergeRule(RegexpChunkRule):
     """
     A rule specifying how to merge chunks in a ``ChunkString``, using
@@ -563,13 +574,13 @@ class MergeRule(RegexpChunkRule):
         self._left_tag_pattern = left_tag_pattern
         self._right_tag_pattern = right_tag_pattern
         regexp = re.compile(
-            "(?P<left>%s)}{(?=%s)"
+            '(?P<left>%s)}{(?=%s)'
             % (
                 tag_pattern2re_pattern(left_tag_pattern),
                 tag_pattern2re_pattern(right_tag_pattern),
             )
         )
-        RegexpChunkRule.__init__(self, regexp, "\g<left>", descr)
+        RegexpChunkRule.__init__(self, regexp, '\g<left>', descr)
 
     def __repr__(self):
         """
@@ -584,14 +595,15 @@ class MergeRule(RegexpChunkRule):
         :rtype: str
         """
         return (
-            "<MergeRule: "
-            + repr(self._left_tag_pattern)
-            + ", "
-            + repr(self._right_tag_pattern)
-            + ">"
+            '<MergeRule: '
+            + unicode_repr(self._left_tag_pattern)
+            + ', '
+            + unicode_repr(self._right_tag_pattern)
+            + '>'
         )
 
 
+@python_2_unicode_compatible
 class SplitRule(RegexpChunkRule):
     """
     A rule specifying how to split chunks in a ``ChunkString``, using
@@ -632,13 +644,13 @@ class SplitRule(RegexpChunkRule):
         self._left_tag_pattern = left_tag_pattern
         self._right_tag_pattern = right_tag_pattern
         regexp = re.compile(
-            "(?P<left>%s)(?=%s)"
+            '(?P<left>%s)(?=%s)'
             % (
                 tag_pattern2re_pattern(left_tag_pattern),
                 tag_pattern2re_pattern(right_tag_pattern),
             )
         )
-        RegexpChunkRule.__init__(self, regexp, r"\g<left>}{", descr)
+        RegexpChunkRule.__init__(self, regexp, r'\g<left>}{', descr)
 
     def __repr__(self):
         """
@@ -653,14 +665,15 @@ class SplitRule(RegexpChunkRule):
        :rtype: str
         """
         return (
-            "<SplitRule: "
-            + repr(self._left_tag_pattern)
-            + ", "
-            + repr(self._right_tag_pattern)
-            + ">"
+            '<SplitRule: '
+            + unicode_repr(self._left_tag_pattern)
+            + ', '
+            + unicode_repr(self._right_tag_pattern)
+            + '>'
         )
 
 
+@python_2_unicode_compatible
 class ExpandLeftRule(RegexpChunkRule):
     """
     A rule specifying how to expand chunks in a ``ChunkString`` to the left,
@@ -702,13 +715,13 @@ class ExpandLeftRule(RegexpChunkRule):
         self._left_tag_pattern = left_tag_pattern
         self._right_tag_pattern = right_tag_pattern
         regexp = re.compile(
-            "(?P<left>%s)\{(?P<right>%s)"
+            '(?P<left>%s)\{(?P<right>%s)'
             % (
                 tag_pattern2re_pattern(left_tag_pattern),
                 tag_pattern2re_pattern(right_tag_pattern),
             )
         )
-        RegexpChunkRule.__init__(self, regexp, "{\g<left>\g<right>", descr)
+        RegexpChunkRule.__init__(self, regexp, '{\g<left>\g<right>', descr)
 
     def __repr__(self):
         """
@@ -723,14 +736,15 @@ class ExpandLeftRule(RegexpChunkRule):
         :rtype: str
         """
         return (
-            "<ExpandLeftRule: "
-            + repr(self._left_tag_pattern)
-            + ", "
-            + repr(self._right_tag_pattern)
-            + ">"
+            '<ExpandLeftRule: '
+            + unicode_repr(self._left_tag_pattern)
+            + ', '
+            + unicode_repr(self._right_tag_pattern)
+            + '>'
         )
 
 
+@python_2_unicode_compatible
 class ExpandRightRule(RegexpChunkRule):
     """
     A rule specifying how to expand chunks in a ``ChunkString`` to the
@@ -772,13 +786,13 @@ class ExpandRightRule(RegexpChunkRule):
         self._left_tag_pattern = left_tag_pattern
         self._right_tag_pattern = right_tag_pattern
         regexp = re.compile(
-            "(?P<left>%s)\}(?P<right>%s)"
+            '(?P<left>%s)\}(?P<right>%s)'
             % (
                 tag_pattern2re_pattern(left_tag_pattern),
                 tag_pattern2re_pattern(right_tag_pattern),
             )
         )
-        RegexpChunkRule.__init__(self, regexp, "\g<left>\g<right>}", descr)
+        RegexpChunkRule.__init__(self, regexp, '\g<left>\g<right>}', descr)
 
     def __repr__(self):
         """
@@ -793,14 +807,15 @@ class ExpandRightRule(RegexpChunkRule):
         :rtype: str
         """
         return (
-            "<ExpandRightRule: "
-            + repr(self._left_tag_pattern)
-            + ", "
-            + repr(self._right_tag_pattern)
-            + ">"
+            '<ExpandRightRule: '
+            + unicode_repr(self._left_tag_pattern)
+            + ', '
+            + unicode_repr(self._right_tag_pattern)
+            + '>'
         )
 
 
+@python_2_unicode_compatible
 class ChunkRuleWithContext(RegexpChunkRule):
     """
     A rule specifying how to add chunks to a ``ChunkString``, using
@@ -853,7 +868,7 @@ class ChunkRuleWithContext(RegexpChunkRule):
         self._chunk_tag_pattern = chunk_tag_pattern
         self._right_context_tag_pattern = right_context_tag_pattern
         regexp = re.compile(
-            "(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s"
+            '(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s'
             % (
                 tag_pattern2re_pattern(left_context_tag_pattern),
                 tag_pattern2re_pattern(chunk_tag_pattern),
@@ -861,7 +876,7 @@ class ChunkRuleWithContext(RegexpChunkRule):
                 ChunkString.IN_CHINK_PATTERN,
             )
         )
-        replacement = r"\g<left>{\g<chunk>}\g<right>"
+        replacement = r'\g<left>{\g<chunk>}\g<right>'
         RegexpChunkRule.__init__(self, regexp, replacement, descr)
 
     def __repr__(self):
@@ -876,7 +891,7 @@ class ChunkRuleWithContext(RegexpChunkRule):
 
         :rtype: str
         """
-        return "<ChunkRuleWithContext:  %r, %r, %r>" % (
+        return '<ChunkRuleWithContext:  %r, %r, %r>' % (
             self._left_context_tag_pattern,
             self._chunk_tag_pattern,
             self._right_context_tag_pattern,
@@ -890,7 +905,7 @@ class ChunkRuleWithContext(RegexpChunkRule):
 # this should probably be made more strict than it is -- e.g., it
 # currently accepts 'foo'.
 CHUNK_TAG_PATTERN = re.compile(
-    r"^((%s|<%s>)*)$" % ("([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+", "[^\{\}<>]+")
+    r'^((%s|<%s>)*)$' % ('([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+', '[^\{\}<>]+')
 )
 
 
@@ -931,13 +946,13 @@ def tag_pattern2re_pattern(tag_pattern):
         ``tag_pattern``.
     """
     # Clean up the regular expression
-    tag_pattern = re.sub(r"\s", "", tag_pattern)
-    tag_pattern = re.sub(r"<", "(<(", tag_pattern)
-    tag_pattern = re.sub(r">", ")>)", tag_pattern)
+    tag_pattern = re.sub(r'\s', '', tag_pattern)
+    tag_pattern = re.sub(r'<', '(<(', tag_pattern)
+    tag_pattern = re.sub(r'>', ')>)', tag_pattern)
 
     # Check the regular expression
     if not CHUNK_TAG_PATTERN.match(tag_pattern):
-        raise ValueError("Bad tag pattern: %r" % tag_pattern)
+        raise ValueError('Bad tag pattern: %r' % tag_pattern)
 
     # Replace "." with CHUNK_TAG_CHAR.
     # We have to do this after, since it adds {}[]<>s, which would
@@ -948,11 +963,11 @@ def tag_pattern2re_pattern(tag_pattern):
     def reverse_str(str):
         lst = list(str)
         lst.reverse()
-        return "".join(lst)
+        return ''.join(lst)
 
     tc_rev = reverse_str(ChunkString.CHUNK_TAG_CHAR)
     reversed = reverse_str(tag_pattern)
-    reversed = re.sub(r"\.(?!\\(\\\\)*($|[^\\]))", tc_rev, reversed)
+    reversed = re.sub(r'\.(?!\\(\\\\)*($|[^\\]))', tc_rev, reversed)
     tag_pattern = reverse_str(reversed)
 
     return tag_pattern
@@ -963,6 +978,7 @@ def tag_pattern2re_pattern(tag_pattern):
 ##//////////////////////////////////////////////////////
 
 
+@python_2_unicode_compatible
 class RegexpChunkParser(ChunkParserI):
     """
     A regular expression based chunk parser.  ``RegexpChunkParser`` uses a
@@ -985,7 +1001,7 @@ class RegexpChunkParser(ChunkParserI):
 
     """
 
-    def __init__(self, rules, chunk_label="NP", root_label="S", trace=0):
+    def __init__(self, rules, chunk_label='NP', root_label='S', trace=0):
         """
         Construct a new ``RegexpChunkParser``.
 
@@ -1024,14 +1040,14 @@ class RegexpChunkParser(ChunkParserI):
         :param verbose: Whether output should be verbose.
         :rtype: None
         """
-        print("# Input:")
+        print('# Input:')
         print(chunkstr)
         for rule in self._rules:
             rule.apply(chunkstr)
             if verbose:
-                print("#", rule.descr() + " (" + repr(rule) + "):")
+                print('#', rule.descr() + ' (' + unicode_repr(rule) + '):')
             else:
-                print("#", rule.descr() + ":")
+                print('#', rule.descr() + ':')
             print(chunkstr)
 
     def _notrace_apply(self, chunkstr):
@@ -1067,7 +1083,7 @@ class RegexpChunkParser(ChunkParserI):
             used to define this ``RegexpChunkParser``.
         """
         if len(chunk_struct) == 0:
-            print("Warning: parsing empty text")
+            print('Warning: parsing empty text')
             return Tree(self._root_label, [])
 
         try:
@@ -1120,7 +1136,7 @@ class RegexpChunkParser(ChunkParserI):
         else:
             format = "    %s\n      %s\n"
         for rule in self._rules:
-            s += format % (rule.descr(), repr(rule))
+            s += format % (rule.descr(), unicode_repr(rule))
         return s[:-1]
 
 
@@ -1129,6 +1145,7 @@ class RegexpChunkParser(ChunkParserI):
 ##//////////////////////////////////////////////////////
 
 
+@python_2_unicode_compatible
 class RegexpParser(ChunkParserI):
     """
     A grammar based chunk parser.  ``chunk.RegexpParser`` uses a set of
@@ -1171,7 +1188,7 @@ class RegexpParser(ChunkParserI):
 
     """
 
-    def __init__(self, grammar, root_label="S", loop=1, trace=0):
+    def __init__(self, grammar, root_label='S', loop=1, trace=0):
         """
         Create a new chunk parser, from the given start state
         and set of chunk patterns.
@@ -1193,12 +1210,12 @@ class RegexpParser(ChunkParserI):
         self._grammar = grammar
         self._loop = loop
 
-        if isinstance(grammar, str):
+        if isinstance(grammar, string_types):
             self._read_grammar(grammar, root_label, trace)
         else:
             # Make sure the grammar looks like it has the right type:
             type_err = (
-                "Expected string or list of RegexpChunkParsers " "for the grammar."
+                'Expected string or list of RegexpChunkParsers ' 'for the grammar.'
             )
             try:
                 grammar = list(grammar)
@@ -1216,21 +1233,21 @@ class RegexpParser(ChunkParserI):
         """
         rules = []
         lhs = None
-        for line in grammar.split("\n"):
+        for line in grammar.split('\n'):
             line = line.strip()
 
             # New stage begins if there's an unescaped ':'
-            m = re.match("(?P<nonterminal>(\\.|[^:])*)(:(?P<rule>.*))", line)
+            m = re.match('(?P<nonterminal>(\\.|[^:])*)(:(?P<rule>.*))', line)
             if m:
                 # Record the stage that we just completed.
                 self._add_stage(rules, lhs, root_label, trace)
                 # Start a new stage.
-                lhs = m.group("nonterminal").strip()
+                lhs = m.group('nonterminal').strip()
                 rules = []
-                line = m.group("rule").strip()
+                line = m.group('rule').strip()
 
             # Skip blank & comment-only lines
-            if line == "" or line.startswith("#"):
+            if line == '' or line.startswith('#'):
                 continue
 
             # Add the rule
@@ -1245,7 +1262,7 @@ class RegexpParser(ChunkParserI):
         """
         if rules != []:
             if not lhs:
-                raise ValueError("Expected stage marker (eg NP:)")
+                raise ValueError('Expected stage marker (eg NP:)')
             parser = RegexpChunkParser(
                 rules, chunk_label=lhs, root_label=root_label, trace=trace
             )
@@ -1323,43 +1340,43 @@ def demo_eval(chunkparser, text):
     # Evaluate our chunk parser.
     chunkscore = chunk.ChunkScore()
 
-    for sentence in text.split("\n"):
+    for sentence in text.split('\n'):
         print(sentence)
         sentence = sentence.strip()
         if not sentence:
             continue
         gold = chunk.tagstr2tree(sentence)
         tokens = gold.leaves()
-        test = chunkparser.parse(Tree("S", tokens), trace=1)
+        test = chunkparser.parse(Tree('S', tokens), trace=1)
         chunkscore.score(gold, test)
         print()
 
-    print("/" + ("=" * 75) + "\\")
-    print("Scoring", chunkparser)
-    print(("-" * 77))
-    print("Precision: %5.1f%%" % (chunkscore.precision() * 100), " " * 4, end=" ")
-    print("Recall: %5.1f%%" % (chunkscore.recall() * 100), " " * 6, end=" ")
-    print("F-Measure: %5.1f%%" % (chunkscore.f_measure() * 100))
+    print('/' + ('=' * 75) + '\\')
+    print('Scoring', chunkparser)
+    print(('-' * 77))
+    print('Precision: %5.1f%%' % (chunkscore.precision() * 100), ' ' * 4, end=' ')
+    print('Recall: %5.1f%%' % (chunkscore.recall() * 100), ' ' * 6, end=' ')
+    print('F-Measure: %5.1f%%' % (chunkscore.f_measure() * 100))
 
     # Missed chunks.
     if chunkscore.missed():
-        print("Missed:")
+        print('Missed:')
         missed = chunkscore.missed()
         for chunk in missed[:10]:
-            print("  ", " ".join(map(str, chunk)))
+            print('  ', ' '.join(map(str, chunk)))
         if len(chunkscore.missed()) > 10:
-            print("  ...")
+            print('  ...')
 
     # Incorrect chunks.
     if chunkscore.incorrect():
-        print("Incorrect:")
+        print('Incorrect:')
         incorrect = chunkscore.incorrect()
         for chunk in incorrect[:10]:
-            print("  ", " ".join(map(str, chunk)))
+            print('  ', ' '.join(map(str, chunk)))
         if len(chunkscore.incorrect()) > 10:
-            print("  ...")
+            print('  ...')
 
-    print("\\" + ("=" * 75) + "/")
+    print('\\' + ('=' * 75) + '/')
     print()
 
 
@@ -1378,10 +1395,10 @@ def demo():
     [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
     """
 
-    print("*" * 75)
-    print("Evaluation text:")
+    print('*' * 75)
+    print('Evaluation text:')
     print(text)
-    print("*" * 75)
+    print('*' * 75)
     print()
 
     grammar = r"""
@@ -1426,7 +1443,7 @@ def demo():
     print("Demonstration of empty grammar:")
 
     cp = chunk.RegexpParser("")
-    print(chunk.accuracy(cp, conll2000.chunked_sents("test.txt", chunk_types=("NP",))))
+    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt', chunk_types=('NP',))))
 
     print()
     print("Demonstration of accuracy evaluation using CoNLL tags:")
@@ -1438,7 +1455,7 @@ def demo():
       <DT|JJ>{}<NN.*>     # merge det/adj with nouns
     """
     cp = chunk.RegexpParser(grammar)
-    print(chunk.accuracy(cp, conll2000.chunked_sents("test.txt")[:5]))
+    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5]))
 
     print()
     print("Demonstration of tagged token input")
@@ -1467,5 +1484,5 @@ def demo():
     )
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
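
The brace notation parsed by ``RegexpChunkRule.fromstring`` above ({...} chunk, }...{ chink, }{ split, {} merge) is the same notation accepted in grammar strings given to ``RegexpParser``. A minimal usage sketch, assuming a standard NLTK install (the grammar and sentence are illustrative only):

    from nltk.chunk import RegexpParser

    # One stage per 'LABEL:' marker; each following line is one rule,
    # with optional '#' comments handled by RegexpChunkRule.fromstring.
    grammar = r"""
      NP: {<DT>?<JJ>*<NN.*>+}   # chunk optional determiner, adjectives, nouns
          }<VB.*>{              # chink any verbs that slipped inside a chunk
    """
    cp = RegexpParser(grammar, root_label='S', loop=1)

    # Input is a list of (word, tag) pairs; the result is a Tree rooted at 'S'.
    tagged = [('the', 'DT'), ('little', 'JJ'), ('cat', 'NN'), ('sat', 'VBD')]
    print(cp.parse(tagged))
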
index 35ee79d..e29760d 100644 (file)
@@ -1,16 +1,18 @@
 # Natural Language Toolkit: Chunk format conversions
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com> (minor additions)
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
 
 import re
 
 from nltk.tree import Tree
 from nltk.tag.mapping import map_tag
 from nltk.tag.util import str2tuple
+from nltk.compat import python_2_unicode_compatible
 
 ##//////////////////////////////////////////////////////
 ## EVALUATION
@@ -118,10 +120,10 @@ class ChunkScore(object):
         self._tp = set()
         self._fp = set()
         self._fn = set()
-        self._max_tp = kwargs.get("max_tp_examples", 100)
-        self._max_fp = kwargs.get("max_fp_examples", 100)
-        self._max_fn = kwargs.get("max_fn_examples", 100)
-        self._chunk_label = kwargs.get("chunk_label", ".*")
+        self._max_tp = kwargs.get('max_tp_examples', 100)
+        self._max_fp = kwargs.get('max_fp_examples', 100)
+        self._max_fn = kwargs.get('max_fn_examples', 100)
+        self._chunk_label = kwargs.get('chunk_label', '.*')
         self._tp_num = 0
         self._fp_num = 0
         self._fn_num = 0
@@ -282,7 +284,7 @@ class ChunkScore(object):
 
         :rtype: str
         """
-        return "<ChunkScoring of " + repr(len(self)) + " chunks>"
+        return '<ChunkScoring of ' + repr(len(self)) + ' chunks>'
 
     def __str__(self):
         """
@@ -318,7 +320,7 @@ def _chunksets(t, count, chunk_label):
 
 
 def tagstr2tree(
-    s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None
+    s, chunk_label="NP", root_label="S", sep='/', source_tagset=None, target_tagset=None
 ):
     """
     Divide a string of bracketed tagged text into
@@ -337,20 +339,20 @@ def tagstr2tree(
     :rtype: Tree
     """
 
-    WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+")
+    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')
 
     stack = [Tree(root_label, [])]
     for match in WORD_OR_BRACKET.finditer(s):
         text = match.group()
-        if text[0] == "[":
+        if text[0] == '[':
             if len(stack) != 1:
-                raise ValueError("Unexpected [ at char {:d}".format(match.start()))
+                raise ValueError('Unexpected [ at char {:d}'.format(match.start()))
             chunk = Tree(chunk_label, [])
             stack[-1].append(chunk)
             stack.append(chunk)
-        elif text[0] == "]":
+        elif text[0] == ']':
             if len(stack) != 2:
-                raise ValueError("Unexpected ] at char {:d}".format(match.start()))
+                raise ValueError('Unexpected ] at char {:d}'.format(match.start()))
             stack.pop()
         else:
             if sep is None:
@@ -362,16 +364,16 @@ def tagstr2tree(
                 stack[-1].append((word, tag))
 
     if len(stack) != 1:
-        raise ValueError("Expected ] at char {:d}".format(len(s)))
+        raise ValueError('Expected ] at char {:d}'.format(len(s)))
     return stack[0]
 
 
 ### CONLL
 
-_LINE_RE = re.compile("(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
+_LINE_RE = re.compile('(\S+)\s+(\S+)\s+([IOB])-?(\S+)?')
 
 
-def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
+def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
     """
     Return a chunk structure for a single sentence
     encoded in the given CONLL 2000 style string.
@@ -391,29 +393,29 @@ def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
 
     stack = [Tree(root_label, [])]
 
-    for lineno, line in enumerate(s.split("\n")):
+    for lineno, line in enumerate(s.split('\n')):
         if not line.strip():
             continue
 
         # Decode the line.
         match = _LINE_RE.match(line)
         if match is None:
-            raise ValueError("Error on line {:d}".format(lineno))
+            raise ValueError('Error on line {:d}'.format(lineno))
         (word, tag, state, chunk_type) = match.groups()
 
         # If it's a chunk type we don't care about, treat it as O.
         if chunk_types is not None and chunk_type not in chunk_types:
-            state = "O"
+            state = 'O'
 
         # For "Begin"/"Outside", finish any completed chunks -
         # also do so for "Inside" which don't match the previous token.
-        mismatch_I = state == "I" and chunk_type != stack[-1].label()
-        if state in "BO" or mismatch_I:
+        mismatch_I = state == 'I' and chunk_type != stack[-1].label()
+        if state in 'BO' or mismatch_I:
             if len(stack) == 2:
                 stack.pop()
 
         # For "Begin", start a new chunk.
-        if state == "B" or mismatch_I:
+        if state == 'B' or mismatch_I:
             chunk = Tree(chunk_type, [])
             stack[-1].append(chunk)
             stack.append(chunk)
@@ -452,7 +454,7 @@ def tree2conlltags(t):
 
 
 def conlltags2tree(
-    sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False
+    sentence, chunk_types=('NP', 'PP', 'VP'), root_label='S', strict=False
 ):
     """
     Convert the CoNLL IOB format to a tree.
@@ -465,9 +467,9 @@ def conlltags2tree(
             else:
                 # Treat as O
                 tree.append((word, postag))
-        elif chunktag.startswith("B-"):
+        elif chunktag.startswith('B-'):
             tree.append(Tree(chunktag[2:], [(word, postag)]))
-        elif chunktag.startswith("I-"):
+        elif chunktag.startswith('I-'):
             if (
                 len(tree) == 0
                 or not isinstance(tree[-1], Tree)
@@ -480,7 +482,7 @@ def conlltags2tree(
                     tree.append(Tree(chunktag[2:], [(word, postag)]))
             else:
                 tree[-1].append((word, postag))
-        elif chunktag == "O":
+        elif chunktag == 'O':
             tree.append((word, postag))
         else:
             raise ValueError("Bad conll tag {0!r}".format(chunktag))
@@ -497,20 +499,20 @@ def tree2conllstr(t):
     :rtype: str
     """
     lines = [" ".join(token) for token in tree2conlltags(t)]
-    return "\n".join(lines)
+    return '\n'.join(lines)
 
 
 ### IEER
 
 _IEER_DOC_RE = re.compile(
-    r"<DOC>\s*"
-    r"(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?"
-    r"(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?"
-    r"(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?"
-    r"<BODY>\s*"
-    r"(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?"
-    r"<TEXT>(?P<text>.*?)</TEXT>\s*"
-    r"</BODY>\s*</DOC>\s*",
+    r'<DOC>\s*'
+    r'(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?'
+    r'(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?'
+    r'(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?'
+    r'<BODY>\s*'
+    r'(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?'
+    r'<TEXT>(?P<text>.*?)</TEXT>\s*'
+    r'</BODY>\s*</DOC>\s*',
     re.DOTALL,
 )
 
@@ -523,17 +525,17 @@ def _ieer_read_text(s, root_label):
     # return the empty list in place of a Tree
     if s is None:
         return []
-    for piece_m in re.finditer("<[^>]+>|[^\s<]+", s):
+    for piece_m in re.finditer('<[^>]+>|[^\s<]+', s):
         piece = piece_m.group()
         try:
-            if piece.startswith("<b_"):
+            if piece.startswith('<b_'):
                 m = _IEER_TYPE_RE.match(piece)
                 if m is None:
-                    print("XXXX", piece)
-                chunk = Tree(m.group("type"), [])
+                    print('XXXX', piece)
+                chunk = Tree(m.group('type'), [])
                 stack[-1].append(chunk)
                 stack.append(chunk)
-            elif piece.startswith("<e_"):
+            elif piece.startswith('<e_'):
                 stack.pop()
             #           elif piece.startswith('<'):
             #               print "ERROR:", piece
@@ -542,25 +544,25 @@ def _ieer_read_text(s, root_label):
                 stack[-1].append(piece)
         except (IndexError, ValueError):
             raise ValueError(
-                "Bad IEER string (error at character {:d})".format(piece_m.start())
+                'Bad IEER string (error at character {:d})'.format(piece_m.start())
             )
     if len(stack) != 1:
-        raise ValueError("Bad IEER string")
+        raise ValueError('Bad IEER string')
     return stack[0]
 
 
 def ieerstr2tree(
     s,
     chunk_types=[
-        "LOCATION",
-        "ORGANIZATION",
-        "PERSON",
-        "DURATION",
-        "DATE",
-        "CARDINAL",
-        "PERCENT",
-        "MONEY",
-        "MEASURE",
+        'LOCATION',
+        'ORGANIZATION',
+        'PERSON',
+        'DURATION',
+        'DATE',
+        'CARDINAL',
+        'PERCENT',
+        'MONEY',
+        'MEASURE',
     ],
     root_label="S",
 ):
@@ -580,13 +582,13 @@ def ieerstr2tree(
     m = _IEER_DOC_RE.match(s)
     if m:
         return {
-            "text": _ieer_read_text(m.group("text"), root_label),
-            "docno": m.group("docno"),
-            "doctype": m.group("doctype"),
-            "date_time": m.group("date_time"),
+            'text': _ieer_read_text(m.group('text'), root_label),
+            'docno': m.group('docno'),
+            'doctype': m.group('doctype'),
+            'date_time': m.group('date_time'),
             #'headline': m.group('headline')
             # we want to capture NEs in the headline too!
-            "headline": _ieer_read_text(m.group("headline"), root_label),
+            'headline': _ieer_read_text(m.group('headline'), root_label),
         }
     else:
         return _ieer_read_text(s, root_label)
@@ -597,7 +599,7 @@ def demo():
     s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
     import nltk
 
-    t = nltk.chunk.tagstr2tree(s, chunk_label="NP")
+    t = nltk.chunk.tagstr2tree(s, chunk_label='NP')
     t.pprint()
     print()
 
@@ -631,7 +633,7 @@ better JJR I-ADJP
 . . O
 """
 
-    conll_tree = conllstr2tree(s, chunk_types=("NP", "PP"))
+    conll_tree = conllstr2tree(s, chunk_types=('NP', 'PP'))
     conll_tree.pprint()
 
     # Demonstrate CoNLL output
@@ -640,5 +642,5 @@ better JJR I-ADJP
     print()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
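
The converters patched above (``tagstr2tree``, ``tree2conlltags``, ``conlltags2tree``) move between bracketed tagged text, chunk trees and CoNLL-style IOB triples. A minimal round-trip sketch, assuming a standard NLTK install (the sentence is illustrative only):

    from nltk.chunk.util import tagstr2tree, tree2conlltags, conlltags2tree

    # Bracketed tagged text -> Tree; '[' ... ']' spans become chunk_label subtrees.
    s = '[ the/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]'
    tree = tagstr2tree(s, chunk_label='NP', root_label='S')

    # Tree -> (word, tag, IOB-tag) triples, and back to a tree again.
    iob = tree2conlltags(tree)
    print(iob)                  # [('the', 'DT', 'B-NP'), ('cat', 'NN', 'I-NP'), ...]
    print(conlltags2tree(iob))  # NP chunks are reconstructed under root 'S'
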
index 36b5059..551c82c 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Classifiers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -57,7 +57,7 @@ to the classifier:
     >>> from nltk.corpus import gutenberg
     >>> for fileid in gutenberg.fileids(): # doctest: +SKIP
     ...     doc = gutenberg.words(fileid) # doctest: +SKIP
-    ...     print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP
+    ...     print fileid, classifier.classify(document_features(doc)) # doctest: +SKIP
 
 The parameters that a feature detector expects will vary, depending on
 the task and the needs of the feature detector.  For example, a
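
The doctest above presumes a ``document_features`` detector that maps a document to a dictionary of feature values, plus a classifier trained on (featureset, label) pairs. A minimal sketch of that shape, assuming a standard NLTK install (the feature names and toy training data are illustrative only):

    from nltk.classify import NaiveBayesClassifier

    def document_features(doc):
        # A feature detector returns a dict of feature name -> feature value.
        words = set(w.lower() for w in doc)
        return {'contains(the)': 'the' in words, 'length>1000': len(doc) > 1000}

    train = [(document_features(['the', 'cat', 'sat']), 'short'),
             (document_features(['word'] * 2000), 'long')]
    classifier = NaiveBayesClassifier.train(train)
    print(classifier.classify(document_features(['the'] * 1500)))
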
index 7df3038..0e6a47a 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/__init__.cpython-37.pyc differ
index 8584b79..1c924a1 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/api.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/api.cpython-37.pyc differ
index 5f56ab9..599c0fe 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/decisiontree.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/decisiontree.cpython-37.pyc differ
index f15741b..86c4620 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/maxent.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/maxent.cpython-37.pyc differ
index bee0930..9bfcd63 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/megam.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/megam.cpython-37.pyc differ
index c78ac74..10f7cf9 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/naivebayes.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/naivebayes.cpython-37.pyc differ
index 82d8249..578f0eb 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/positivenaivebayes.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/positivenaivebayes.cpython-37.pyc differ
index 3045dfd..7e50e97 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/rte_classify.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/rte_classify.cpython-37.pyc differ
index 048bb8a..48a9e22 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/scikitlearn.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/scikitlearn.cpython-37.pyc differ
index 286c8c5..2f80331 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/senna.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/senna.cpython-37.pyc differ
index 38da623..73f364e 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/svm.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/svm.cpython-37.pyc differ
index 649d9d1..7a3efde 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/tadm.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/tadm.cpython-37.pyc differ
index c0b9144..87b169f 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/textcat.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/textcat.cpython-37.pyc differ
index d46f923..00cf657 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/util.cpython-37.pyc differ
index cf63549..19fd5d1 100644 (file)
Binary files a/nlp_resource_data/nltk/classify/__pycache__/weka.cpython-37.pyc and b/nlp_resource_data/nltk/classify/__pycache__/weka.cpython-37.pyc differ
index ba6d88e..91987c1 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Classifier Interface
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com> (minor additions)
 # URL: <http://nltk.org/>
index 10c784c..0739cf4 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Decision Tree Classifiers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -10,13 +10,16 @@ A classifier model that decides which label to assign to a token on
 the basis of a tree structure, where branches correspond to conditions
 on feature values, and leaves correspond to label assignments.
 """
+from __future__ import print_function, unicode_literals, division
 
 from collections import defaultdict
 
 from nltk.probability import FreqDist, MLEProbDist, entropy
 from nltk.classify.api import ClassifierI
+from nltk.compat import python_2_unicode_compatible
 
 
+@python_2_unicode_compatible
 class DecisionTreeClassifier(ClassifierI):
     def __init__(self, label, feature_name=None, decisions=None, default=None):
         """
@@ -69,7 +72,7 @@ class DecisionTreeClassifier(ClassifierI):
                 errors += 1
         return errors / len(labeled_featuresets)
 
-    def pretty_format(self, width=70, prefix="", depth=4):
+    def pretty_format(self, width=70, prefix='', depth=4):
         """
         Return a string containing a pretty-printed version of this
         decision tree.  Each line in this string corresponds to a
@@ -81,24 +84,20 @@ class DecisionTreeClassifier(ClassifierI):
             n = width - len(prefix) - 15
             return '{0}{1} {2}\n'.format(prefix, '.' * n, self._label)
         s = ''
-        for i, (fval, result) in enumerate(sorted(self._decisions.items(), 
-                                                  key=lambda item: 
-                                                  (item[0] in [None, False, True], str(item[0]).lower())
-                                                 )
-                                          ):
+        for i, (fval, result) in enumerate(sorted(self._decisions.items())):
             hdr = '{0}{1}={2}? '.format(prefix, self._fname, fval)
             n = width - 15 - len(hdr)
-            s += "{0}{1} {2}\n".format(hdr, "." * (n), result._label)
+            s += '{0}{1} {2}\n'.format(hdr, '.' * (n), result._label)
             if result._fname is not None and depth > 1:
-                s += result.pretty_format(width, prefix + "  ", depth - 1)
+                s += result.pretty_format(width, prefix + '  ', depth - 1)
         if self._default is not None:
             n = width - len(prefix) - 21
-            s += "{0}else: {1} {2}\n".format(prefix, "." * n, self._default._label)
+            s += '{0}else: {1} {2}\n'.format(prefix, '.' * n, self._default._label)
             if self._default._fname is not None and depth > 1:
-                s += self._default.pretty_format(width, prefix + "  ", depth - 1)
+                s += self._default.pretty_format(width, prefix + '  ', depth - 1)
         return s
 
-    def pseudocode(self, prefix="", depth=4):
+    def pseudocode(self, prefix='', depth=4):
         """
         Return a string representation of this decision tree that
         expresses the decisions it makes as a nested set of pseudocode
@@ -107,26 +106,23 @@ class DecisionTreeClassifier(ClassifierI):
         if self._fname is None:
             return "{0}return {1!r}\n".format(prefix, self._label)
         s = ''
-        for (fval, result) in sorted(self._decisions.items(),
-                                    key=lambda item: 
-                                     (item[0] in [None, False, True], str(item[0]).lower())
-                                    ):
+        for (fval, result) in sorted(self._decisions.items()):
             s += '{0}if {1} == {2!r}: '.format(prefix, self._fname, fval)
             if result._fname is not None and depth > 1:
-                s += "\n" + result.pseudocode(prefix + "  ", depth - 1)
+                s += '\n' + result.pseudocode(prefix + '  ', depth - 1)
             else:
-                s += "return {0!r}\n".format(result._label)
+                s += 'return {0!r}\n'.format(result._label)
         if self._default is not None:
             if len(self._decisions) == 1:
-                s += "{0}if {1} != {2!r}: ".format(
+                s += '{0}if {1} != {2!r}: '.format(
                     prefix, self._fname, list(self._decisions.keys())[0]
                 )
             else:
-                s += "{0}else: ".format(prefix)
+                s += '{0}else: '.format(prefix)
             if self._default._fname is not None and depth > 1:
-                s += "\n" + self._default.pseudocode(prefix + "  ", depth - 1)
+                s += '\n' + self._default.pseudocode(prefix + '  ', depth - 1)
             else:
-                s += "return {0!r}\n".format(self._default._label)
+                s += 'return {0!r}\n'.format(self._default._label)
         return s
 
     def __str__(self):
@@ -269,7 +265,7 @@ class DecisionTreeClassifier(ClassifierI):
         if verbose:
             print(
                 (
-                    "best stump for {:6d} toks uses {:20} err={:6.4f}".format(
+                    'best stump for {:6d} toks uses {:20} err={:6.4f}'.format(
                         len(labeled_featuresets), best_stump._fname, best_error
                     )
                 )
@@ -316,14 +312,14 @@ class DecisionTreeClassifier(ClassifierI):
                     best_stump = stump
         if verbose:
             if best_stump._decisions:
-                descr = "{0}={1}".format(
+                descr = '{0}={1}'.format(
                     best_stump._fname, list(best_stump._decisions.keys())[0]
                 )
             else:
-                descr = "(default)"
+                descr = '(default)'
             print(
                 (
-                    "best stump for {:6d} toks uses {:20} err={:6.4f}".format(
+                    'best stump for {:6d} toks uses {:20} err={:6.4f}'.format(
                         len(labeled_featuresets), descr, best_error
                     )
                 )
@@ -346,9 +342,9 @@ def demo():
     classifier = names_demo(
         f, binary_names_demo_features  # DecisionTreeClassifier.train,
     )
-    print(classifier.pretty_format(depth=7))
+    print(classifier.pp(depth=7))
     print(classifier.pseudocode(depth=7))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
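
The ``pretty_format`` and ``pseudocode`` methods touched above render a trained tree as text. A minimal sketch of producing one, assuming a standard NLTK install (the toy featuresets are illustrative only):

    from nltk.classify import DecisionTreeClassifier

    # (featureset, label) pairs; the tree splits on the feature with lowest error.
    train = [({'outlook': 'sunny'}, 'play'),
             ({'outlook': 'sunny'}, 'play'),
             ({'outlook': 'rainy'}, 'stay')]

    tree = DecisionTreeClassifier.train(train)
    print(tree.pseudocode(depth=4))
    print(tree.classify({'outlook': 'rainy'}))   # expected: 'stay' for this toy data
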
index 7a03f81..e74b676 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Maximum Entropy Classifiers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Dmitry Chichkov <dchichkov@gmail.com> (TypedMaxentFeatureEncoding)
 # URL: <http://nltk.org/>
@@ -51,6 +51,8 @@ For all values of ``feat_val`` and ``some_label``.  This mapping is
 performed by classes that implement the ``MaxentFeatureEncodingI``
 interface.
 """
+from __future__ import print_function, unicode_literals
+
 try:
     import numpy
 except ImportError:
@@ -60,6 +62,9 @@ import tempfile
 import os
 from collections import defaultdict
 
+from six import integer_types
+
+from nltk import compat
 from nltk.data import gzip_open_unicode
 from nltk.util import OrderedDict
 from nltk.probability import DictionaryProbDist
@@ -69,13 +74,14 @@ from nltk.classify.util import CutoffChecker, accuracy, log_likelihood
 from nltk.classify.megam import call_megam, write_megam_file, parse_megam_weights
 from nltk.classify.tadm import call_tadm, write_tadm_file, parse_tadm_weights
 
-__docformat__ = "epytext en"
+__docformat__ = 'epytext en'
 
 ######################################################################
 # { Classifier Model
 ######################################################################
 
 
+@compat.python_2_unicode_compatible
 class MaxentClassifier(ClassifierI):
     """
     A maximum entropy classifier (also known as a "conditional
@@ -167,16 +173,16 @@ class MaxentClassifier(ClassifierI):
         probabilities of each label for that featureset.
         """
         descr_width = 50
-        TEMPLATE = "  %-" + str(descr_width - 2) + "s%s%8.3f"
+        TEMPLATE = '  %-' + str(descr_width - 2) + 's%s%8.3f'
 
         pdist = self.prob_classify(featureset)
         labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
         labels = labels[:columns]
         print(
-            "  Feature".ljust(descr_width)
-            + "".join("%8s" % (("%s" % l)[:7]) for l in labels)
+            '  Feature'.ljust(descr_width)
+            + ''.join('%8s' % (("%s" % l)[:7]) for l in labels)
         )
-        print("  " + "-" * (descr_width - 2 + 8 * len(labels)))
+        print('  ' + '-' * (descr_width - 2 + 8 * len(labels)))
         sums = defaultdict(int)
         for i, label in enumerate(labels):
             feature_vector = self._encoding.encode(featureset, label)
@@ -189,26 +195,26 @@ class MaxentClassifier(ClassifierI):
                 else:
                     score = self._weights[f_id] ** f_val
                 descr = self._encoding.describe(f_id)
-                descr = descr.split(" and label is ")[0]  # hack
-                descr += " (%s)" % f_val  # hack
+                descr = descr.split(' and label is ')[0]  # hack
+                descr += ' (%s)' % f_val  # hack
                 if len(descr) > 47:
-                    descr = descr[:44] + "..."
-                print(TEMPLATE % (descr, i * 8 * " ", score))
+                    descr = descr[:44] + '...'
+                print(TEMPLATE % (descr, i * 8 * ' ', score))
                 sums[label] += score
-        print("  " + "-" * (descr_width - 1 + 8 * len(labels)))
+        print('  ' + '-' * (descr_width - 1 + 8 * len(labels)))
         print(
-            "  TOTAL:".ljust(descr_width) + "".join("%8.3f" % sums[l] for l in labels)
+            '  TOTAL:'.ljust(descr_width) + ''.join('%8.3f' % sums[l] for l in labels)
         )
         print(
-            "  PROBS:".ljust(descr_width)
-            + "".join("%8.3f" % pdist.prob(l) for l in labels)
+            '  PROBS:'.ljust(descr_width)
+            + ''.join('%8.3f' % pdist.prob(l) for l in labels)
         )
 
     def most_informative_features(self, n=10):
         """
         Generates the ranked list of informative features from most to least.
         """
-        if hasattr(self, "_most_informative_features"):
+        if hasattr(self, '_most_informative_features'):
             return self._most_informative_features[:n]
         else:
             self._most_informative_features = sorted(
@@ -218,7 +224,7 @@ class MaxentClassifier(ClassifierI):
             )
             return self._most_informative_features[:n]
 
-    def show_most_informative_features(self, n=10, show="all"):
+    def show_most_informative_features(self, n=10, show='all'):
         """
         :param show: all, neg, or pos (for negative-only or positive-only)
         :type show: str
@@ -227,22 +233,22 @@ class MaxentClassifier(ClassifierI):
         """
         # Use None for the full list of ranked features.
         fids = self.most_informative_features(None)
-        if show == "pos":
+        if show == 'pos':
             fids = [fid for fid in fids if self._weights[fid] > 0]
-        elif show == "neg":
+        elif show == 'neg':
             fids = [fid for fid in fids if self._weights[fid] < 0]
         for fid in fids[:n]:
-            print("%8.3f %s" % (self._weights[fid], self._encoding.describe(fid)))
+            print('%8.3f %s' % (self._weights[fid], self._encoding.describe(fid)))
 
     def __repr__(self):
-        return "<ConditionalExponentialClassifier: %d labels, %d features>" % (
+        return '<ConditionalExponentialClassifier: %d labels, %d features>' % (
             len(self._encoding.labels()),
             self._encoding.length(),
         )
 
     #: A list of the algorithm names that are accepted for the
     #: ``train()`` method's ``algorithm`` parameter.
-    ALGORITHMS = ["GIS", "IIS", "MEGAM", "TADM"]
+    ALGORITHMS = ['GIS', 'IIS', 'MEGAM', 'TADM']
 
     @classmethod
     def train(
@@ -307,42 +313,42 @@ class MaxentClassifier(ClassifierI):
               log likelihood by less than ``v``.
         """
         if algorithm is None:
-            algorithm = "iis"
+            algorithm = 'iis'
         for key in cutoffs:
             if key not in (
-                "max_iter",
-                "min_ll",
-                "min_lldelta",
-                "max_acc",
-                "min_accdelta",
-                "count_cutoff",
-                "norm",
-                "explicit",
-                "bernoulli",
+                'max_iter',
+                'min_ll',
+                'min_lldelta',
+                'max_acc',
+                'min_accdelta',
+                'count_cutoff',
+                'norm',
+                'explicit',
+                'bernoulli',
             ):
-                raise TypeError("Unexpected keyword arg %r" % key)
+                raise TypeError('Unexpected keyword arg %r' % key)
         algorithm = algorithm.lower()
-        if algorithm == "iis":
+        if algorithm == 'iis':
             return train_maxent_classifier_with_iis(
                 train_toks, trace, encoding, labels, **cutoffs
             )
-        elif algorithm == "gis":
+        elif algorithm == 'gis':
             return train_maxent_classifier_with_gis(
                 train_toks, trace, encoding, labels, **cutoffs
             )
-        elif algorithm == "megam":
+        elif algorithm == 'megam':
             return train_maxent_classifier_with_megam(
                 train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs
             )
-        elif algorithm == "tadm":
+        elif algorithm == 'tadm':
             kwargs = cutoffs
-            kwargs["trace"] = trace
-            kwargs["encoding"] = encoding
-            kwargs["labels"] = labels
-            kwargs["gaussian_prior_sigma"] = gaussian_prior_sigma
+            kwargs['trace'] = trace
+            kwargs['encoding'] = encoding
+            kwargs['labels'] = labels
+            kwargs['gaussian_prior_sigma'] = gaussian_prior_sigma
             return TadmMaxentClassifier.train(train_toks, **kwargs)
         else:
-            raise ValueError("Unknown algorithm %s" % algorithm)
+            raise ValueError('Unknown algorithm %s' % algorithm)
 
 
 #: Alias for MaxentClassifier.
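
The ``train()`` dispatch above selects among the IIS, GIS, megam and TADM trainers, with 'iis' as the default when no algorithm is named. A minimal sketch of invoking it, assuming NLTK and numpy are available (the toy training pairs are illustrative only):

    from nltk.classify import MaxentClassifier

    train = [({'a': True, 'b': False}, 'x'),
             ({'a': False, 'b': True}, 'y')]
    # 'max_iter' is one of the cutoff keywords accepted above; trace=0 silences output.
    clf = MaxentClassifier.train(train, algorithm='iis', trace=0, max_iter=10)
    print(clf.classify({'a': True, 'b': False}))
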
@@ -471,7 +477,7 @@ class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI):
         return self._labels
 
     def describe(self, fid):
-        return "no description available"
+        return 'no description available'
 
 
 class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
@@ -534,8 +540,8 @@ class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
         """
         if set(mapping.values()) != set(range(len(mapping))):
             raise ValueError(
-                "Mapping values must be exactly the "
-                "set of integers from 0...len(mapping)"
+                'Mapping values must be exactly the '
+                'set of integers from 0...len(mapping)'
             )
 
         self._labels = list(labels)
@@ -595,8 +601,8 @@ class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
 
     def describe(self, f_id):
         # Inherit docs.
-        if not isinstance(f_id, int):
-            raise TypeError("describe() expected an int")
+        if not isinstance(f_id, integer_types):
+            raise TypeError('describe() expected an int')
         try:
             self._inv_mapping
         except AttributeError:
@@ -606,17 +612,17 @@ class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
 
         if f_id < len(self._mapping):
             (fname, fval, label) = self._inv_mapping[f_id]
-            return "%s==%r and label is %r" % (fname, fval, label)
+            return '%s==%r and label is %r' % (fname, fval, label)
         elif self._alwayson and f_id in self._alwayson.values():
             for (label, f_id2) in self._alwayson.items():
                 if f_id == f_id2:
-                    return "label is %r" % label
+                    return 'label is %r' % label
         elif self._unseen and f_id in self._unseen.values():
             for (fname, f_id2) in self._unseen.items():
                 if f_id == f_id2:
-                    return "%s is unseen" % fname
+                    return '%s is unseen' % fname
         else:
-            raise ValueError("Bad feature id")
+            raise ValueError('Bad feature id')
 
     def labels(self):
         # Inherit docs.
@@ -660,7 +666,7 @@ class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
 
         for (tok, label) in train_toks:
             if labels and label not in labels:
-                raise ValueError("Unexpected label %s" % label)
+                raise ValueError('Unexpected label %s' % label)
             seen_labels.add(label)
 
             # Record each of the features.
@@ -724,7 +730,7 @@ class GISEncoding(BinaryMaxentFeatureEncoding):
         # Add a correction feature.
         total = sum(v for (f, v) in encoding)
         if total >= self._C:
-            raise ValueError("Correction feature is not high enough!")
+            raise ValueError('Correction feature is not high enough!')
         encoding.append((base_length, self._C - total))
 
         # Return the result
@@ -735,7 +741,7 @@ class GISEncoding(BinaryMaxentFeatureEncoding):
 
     def describe(self, f_id):
         if f_id == BinaryMaxentFeatureEncoding.length(self):
-            return "Correction feature (%s)" % self._C
+            return 'Correction feature (%s)' % self._C
         else:
             return BinaryMaxentFeatureEncoding.describe(self, f_id)
 
@@ -872,8 +878,8 @@ class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
         """
         if set(mapping.values()) != set(range(len(mapping))):
             raise ValueError(
-                "Mapping values must be exactly the "
-                "set of integers from 0...len(mapping)"
+                'Mapping values must be exactly the '
+                'set of integers from 0...len(mapping)'
             )
 
         self._labels = list(labels)
@@ -910,7 +916,7 @@ class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
 
         # Convert input-features to joint-features:
         for fname, fval in featureset.items():
-            if isinstance(fval, (int, float)):
+            if isinstance(fval, (integer_types, float)):
                 # Known feature name & value:
                 if (fname, type(fval), label) in self._mapping:
                     encoding.append((self._mapping[fname, type(fval), label], fval))
@@ -938,8 +944,8 @@ class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
 
     def describe(self, f_id):
         # Inherit docs.
-        if not isinstance(f_id, int):
-            raise TypeError("describe() expected an int")
+        if not isinstance(f_id, integer_types):
+            raise TypeError('describe() expected an int')
         try:
             self._inv_mapping
         except AttributeError:
@@ -949,17 +955,17 @@ class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
 
         if f_id < len(self._mapping):
             (fname, fval, label) = self._inv_mapping[f_id]
-            return "%s==%r and label is %r" % (fname, fval, label)
+            return '%s==%r and label is %r' % (fname, fval, label)
         elif self._alwayson and f_id in self._alwayson.values():
             for (label, f_id2) in self._alwayson.items():
                 if f_id == f_id2:
-                    return "label is %r" % label
+                    return 'label is %r' % label
         elif self._unseen and f_id in self._unseen.values():
             for (fname, f_id2) in self._unseen.items():
                 if f_id == f_id2:
-                    return "%s is unseen" % fname
+                    return '%s is unseen' % fname
         else:
-            raise ValueError("Bad feature id")
+            raise ValueError('Bad feature id')
 
     def labels(self):
         # Inherit docs.
@@ -1006,7 +1012,7 @@ class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
 
         for (tok, label) in train_toks:
             if labels and label not in labels:
-                raise ValueError("Unexpected label %s" % label)
+                raise ValueError('Unexpected label %s' % label)
             seen_labels.add(label)
 
             # Record each of the features.
@@ -1043,17 +1049,17 @@ def train_maxent_classifier_with_gis(
 
     :see: ``train_maxent_classifier()`` for parameter descriptions.
     """
-    cutoffs.setdefault("max_iter", 100)
+    cutoffs.setdefault('max_iter', 100)
     cutoffchecker = CutoffChecker(cutoffs)
 
     # Construct an encoding from the training data.
     if encoding is None:
         encoding = GISEncoding.train(train_toks, labels=labels)
 
-    if not hasattr(encoding, "C"):
+    if not hasattr(encoding, 'C'):
         raise TypeError(
-            "The GIS algorithm requires an encoding that "
-            "defines C (e.g., GISEncoding)."
+            'The GIS algorithm requires an encoding that '
+            'defines C (e.g., GISEncoding).'
         )
 
     # Cinv is the inverse of the sum of each joint feature vector.
@@ -1069,7 +1075,7 @@ def train_maxent_classifier_with_gis(
 
     # Build the classifier.  Start with weight=0 for each attested
     # feature, and weight=-infinity for each unattested feature.
-    weights = numpy.zeros(len(empirical_fcount), "d")
+    weights = numpy.zeros(len(empirical_fcount), 'd')
     for fid in unattested:
         weights[fid] = numpy.NINF
     classifier = ConditionalExponentialClassifier(encoding, weights)
@@ -1079,11 +1085,11 @@ def train_maxent_classifier_with_gis(
     del empirical_fcount
 
     if trace > 0:
-        print("  ==> Training (%d iterations)" % cutoffs["max_iter"])
+        print('  ==> Training (%d iterations)' % cutoffs['max_iter'])
     if trace > 2:
         print()
-        print("      Iteration    Log Likelihood    Accuracy")
-        print("      ---------------------------------------")
+        print('      Iteration    Log Likelihood    Accuracy')
+        print('      ---------------------------------------')
 
     # Train the classifier.
     try:
@@ -1092,7 +1098,7 @@ def train_maxent_classifier_with_gis(
                 ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                 acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                 iternum = cutoffchecker.iter
-                print("     %9d    %14.5f    %9.3f" % (iternum, ll, acc))
+                print('     %9d    %14.5f    %9.3f' % (iternum, ll, acc))
 
             # Use the model to estimate the number of times each
             # feature should occur in the training data.
@@ -1116,21 +1122,21 @@ def train_maxent_classifier_with_gis(
                 break
 
     except KeyboardInterrupt:
-        print("      Training stopped: keyboard interrupt")
+        print('      Training stopped: keyboard interrupt')
     except:
         raise
 
     if trace > 2:
         ll = log_likelihood(classifier, train_toks)
         acc = accuracy(classifier, train_toks)
-        print("         Final    %14.5f    %9.3f" % (ll, acc))
+        print('         Final    %14.5f    %9.3f' % (ll, acc))
 
     # Return the classifier.
     return classifier
 
 
 def calculate_empirical_fcount(train_toks, encoding):
-    fcount = numpy.zeros(encoding.length(), "d")
+    fcount = numpy.zeros(encoding.length(), 'd')
 
     for tok, label in train_toks:
         for (index, val) in encoding.encode(tok, label):
@@ -1140,7 +1146,7 @@ def calculate_empirical_fcount(train_toks, encoding):
 
 
 def calculate_estimated_fcount(classifier, train_toks, encoding):
-    fcount = numpy.zeros(encoding.length(), "d")
+    fcount = numpy.zeros(encoding.length(), 'd')
 
     for tok, label in train_toks:
         pdist = classifier.prob_classify(tok)
@@ -1169,7 +1175,7 @@ def train_maxent_classifier_with_iis(
 
     :see: ``train_maxent_classifier()`` for parameter descriptions.
     """
-    cutoffs.setdefault("max_iter", 100)
+    cutoffs.setdefault('max_iter', 100)
     cutoffchecker = CutoffChecker(cutoffs)
 
     # Construct an encoding from the training data.
@@ -1185,7 +1191,7 @@ def train_maxent_classifier_with_iis(
     # nfarray performs the reverse operation.  nfident is
     # nfarray multiplied by an identity matrix.
     nfmap = calculate_nfmap(train_toks, encoding)
-    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), "d")
+    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd')
     nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))
 
     # Check for any features that are not attested in train_toks.
@@ -1193,17 +1199,17 @@ def train_maxent_classifier_with_iis(
 
     # Build the classifier.  Start with weight=0 for each attested
     # feature, and weight=-infinity for each unattested feature.
-    weights = numpy.zeros(len(empirical_ffreq), "d")
+    weights = numpy.zeros(len(empirical_ffreq), 'd')
     for fid in unattested:
         weights[fid] = numpy.NINF
     classifier = ConditionalExponentialClassifier(encoding, weights)
 
     if trace > 0:
-        print("  ==> Training (%d iterations)" % cutoffs["max_iter"])
+        print('  ==> Training (%d iterations)' % cutoffs['max_iter'])
     if trace > 2:
         print()
-        print("      Iteration    Log Likelihood    Accuracy")
-        print("      ---------------------------------------")
+        print('      Iteration    Log Likelihood    Accuracy')
+        print('      ---------------------------------------')
 
     # Train the classifier.
     try:
@@ -1212,7 +1218,7 @@ def train_maxent_classifier_with_iis(
                 ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                 acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                 iternum = cutoffchecker.iter
-                print("     %9d    %14.5f    %9.3f" % (iternum, ll, acc))
+                print('     %9d    %14.5f    %9.3f' % (iternum, ll, acc))
 
             # Calculate the deltas for this iteration, using Newton's method.
             deltas = calculate_deltas(
@@ -1236,14 +1242,14 @@ def train_maxent_classifier_with_iis(
                 break
 
     except KeyboardInterrupt:
-        print("      Training stopped: keyboard interrupt")
+        print('      Training stopped: keyboard interrupt')
     except:
         raise
 
     if trace > 2:
         ll = log_likelihood(classifier, train_toks)
         acc = accuracy(classifier, train_toks)
-        print("         Final    %14.5f    %9.3f" % (ll, acc))
+        print('         Final    %14.5f    %9.3f' % (ll, acc))
 
     # Return the classifier.
     return classifier
@@ -1353,12 +1359,12 @@ def calculate_deltas(
     NEWTON_CONVERGE = 1e-12
     MAX_NEWTON = 300
 
-    deltas = numpy.ones(encoding.length(), "d")
+    deltas = numpy.ones(encoding.length(), 'd')
 
     # Precompute the A matrix:
     # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) )
     # over all label,fs s.t. num_features[label,fs]=nf
-    A = numpy.zeros((len(nfmap), encoding.length()), "d")
+    A = numpy.zeros((len(nfmap), encoding.length()), 'd')
 
     for tok, label in train_toks:
         dist = classifier.prob_classify(tok)
@@ -1427,40 +1433,40 @@ def train_maxent_classifier_with_megam(
 
     explicit = True
     bernoulli = True
-    if "explicit" in kwargs:
-        explicit = kwargs["explicit"]
-    if "bernoulli" in kwargs:
-        bernoulli = kwargs["bernoulli"]
+    if 'explicit' in kwargs:
+        explicit = kwargs['explicit']
+    if 'bernoulli' in kwargs:
+        bernoulli = kwargs['bernoulli']
 
     # Construct an encoding from the training data.
     if encoding is None:
         # Count cutoff can also be controlled by megam with the -minfc
         # option. Not sure where the best place for it is.
-        count_cutoff = kwargs.get("count_cutoff", 0)
+        count_cutoff = kwargs.get('count_cutoff', 0)
         encoding = BinaryMaxentFeatureEncoding.train(
             train_toks, count_cutoff, labels=labels, alwayson_features=True
         )
     elif labels is not None:
-        raise ValueError("Specify encoding or labels, not both")
+        raise ValueError('Specify encoding or labels, not both')
 
     # Write a training file for megam.
     try:
-        fd, trainfile_name = tempfile.mkstemp(prefix="nltk-")
-        with open(trainfile_name, "w") as trainfile:
+        fd, trainfile_name = tempfile.mkstemp(prefix='nltk-')
+        with open(trainfile_name, 'w') as trainfile:
             write_megam_file(
                 train_toks, encoding, trainfile, explicit=explicit, bernoulli=bernoulli
             )
         os.close(fd)
     except (OSError, IOError, ValueError) as e:
-        raise ValueError("Error while creating megam training file: %s" % e)
+        raise ValueError('Error while creating megam training file: %s' % e)
 
     # Run megam on the training file.
     options = []
-    options += ["-nobias", "-repeat", "10"]
+    options += ['-nobias', '-repeat', '10']
     if explicit:
-        options += ["-explicit"]
+        options += ['-explicit']
     if not bernoulli:
-        options += ["-fvals"]
+        options += ['-fvals']
     if gaussian_prior_sigma:
         # Lambda is just the precision of the Gaussian prior, i.e. it's the
         # inverse variance, so the parameter conversion is 1.0/sigma**2.
@@ -1468,25 +1474,25 @@ def train_maxent_classifier_with_megam(
         inv_variance = 1.0 / gaussian_prior_sigma ** 2
     else:
         inv_variance = 0
-    options += ["-lambda", "%.2f" % inv_variance, "-tune"]
+    options += ['-lambda', '%.2f' % inv_variance, '-tune']
     if trace < 3:
-        options += ["-quiet"]
-    if "max_iter" in kwargs:
-        options += ["-maxi", "%s" % kwargs["max_iter"]]
-    if "ll_delta" in kwargs:
+        options += ['-quiet']
+    if 'max_iter' in kwargs:
+        options += ['-maxi', '%s' % kwargs['max_iter']]
+    if 'll_delta' in kwargs:
         # [xx] this is actually a perplexity delta, not a log
         # likelihood delta
-        options += ["-dpp", "%s" % abs(kwargs["ll_delta"])]
-    if hasattr(encoding, "cost"):
-        options += ["-multilabel"]  # each possible la
-    options += ["multiclass", trainfile_name]
+        options += ['-dpp', '%s' % abs(kwargs['ll_delta'])]
+    if hasattr(encoding, 'cost'):
+        options += ['-multilabel']  # each possible la
+    options += ['multiclass', trainfile_name]
     stdout = call_megam(options)
-    # print('./megam_i686.opt ', ' '.join(options))
+    # print './megam_i686.opt ', ' '.join(options)
     # Delete the training file
     try:
         os.remove(trainfile_name)
     except (OSError, IOError) as e:
-        print("Warning: unable to delete %s: %s" % (trainfile_name, e))
+        print('Warning: unable to delete %s: %s' % (trainfile_name, e))
 
     # Parse the generated weight vector.
     weights = parse_megam_weights(stdout, encoding.length(), explicit)
@@ -1506,14 +1512,14 @@ def train_maxent_classifier_with_megam(
 class TadmMaxentClassifier(MaxentClassifier):
     @classmethod
     def train(cls, train_toks, **kwargs):
-        algorithm = kwargs.get("algorithm", "tao_lmvm")
-        trace = kwargs.get("trace", 3)
-        encoding = kwargs.get("encoding", None)
-        labels = kwargs.get("labels", None)
-        sigma = kwargs.get("gaussian_prior_sigma", 0)
-        count_cutoff = kwargs.get("count_cutoff", 0)
-        max_iter = kwargs.get("max_iter")
-        ll_delta = kwargs.get("min_lldelta")
+        algorithm = kwargs.get('algorithm', 'tao_lmvm')
+        trace = kwargs.get('trace', 3)
+        encoding = kwargs.get('encoding', None)
+        labels = kwargs.get('labels', None)
+        sigma = kwargs.get('gaussian_prior_sigma', 0)
+        count_cutoff = kwargs.get('count_cutoff', 0)
+        max_iter = kwargs.get('max_iter')
+        ll_delta = kwargs.get('min_lldelta')
 
         # Construct an encoding from the training data.
         if not encoding:
@@ -1522,33 +1528,33 @@ class TadmMaxentClassifier(MaxentClassifier):
             )
 
         trainfile_fd, trainfile_name = tempfile.mkstemp(
-            prefix="nltk-tadm-events-", suffix=".gz"
+            prefix='nltk-tadm-events-', suffix='.gz'
         )
-        weightfile_fd, weightfile_name = tempfile.mkstemp(prefix="nltk-tadm-weights-")
+        weightfile_fd, weightfile_name = tempfile.mkstemp(prefix='nltk-tadm-weights-')
 
-        trainfile = gzip_open_unicode(trainfile_name, "w")
+        trainfile = gzip_open_unicode(trainfile_name, 'w')
         write_tadm_file(train_toks, encoding, trainfile)
         trainfile.close()
 
         options = []
-        options.extend(["-monitor"])
-        options.extend(["-method", algorithm])
+        options.extend(['-monitor'])
+        options.extend(['-method', algorithm])
         if sigma:
-            options.extend(["-l2", "%.6f" % sigma ** 2])
+            options.extend(['-l2', '%.6f' % sigma ** 2])
         if max_iter:
-            options.extend(["-max_it", "%d" % max_iter])
+            options.extend(['-max_it', '%d' % max_iter])
         if ll_delta:
-            options.extend(["-fatol", "%.6f" % abs(ll_delta)])
-        options.extend(["-events_in", trainfile_name])
-        options.extend(["-params_out", weightfile_name])
+            options.extend(['-fatol', '%.6f' % abs(ll_delta)])
+        options.extend(['-events_in', trainfile_name])
+        options.extend(['-params_out', weightfile_name])
         if trace < 3:
-            options.extend(["2>&1"])
+            options.extend(['2>&1'])
         else:
-            options.extend(["-summary"])
+            options.extend(['-summary'])
 
         call_tadm(options)
 
-        with open(weightfile_name, "r") as weightfile:
+        with open(weightfile_name, 'r') as weightfile:
             weights = parse_tadm_weights(weightfile)
 
         os.remove(trainfile_name)
@@ -1570,5 +1576,5 @@ def demo():
     classifier = names_demo(MaxentClassifier.train)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
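For readers skimming these GIS hunks: the loop they restyle does one thing per iteration, namely compare how often each feature fires in the training data (the empirical count) with how often the current model expects it to fire (the estimated count), and move the weight by the scaled log ratio. Below is a minimal sketch of that single update, reusing the variable names from the diff; `gis_step` itself is illustrative, not an NLTK function, and the real trainer adds some bookkeeping around the same idea.

    import numpy

    def gis_step(weights, empirical_fcount, estimated_fcount, Cinv, unattested):
        # One textbook GIS update: nudge each weight by the scaled log-ratio of
        # the empirical feature count to the count the current model expects.
        estimated = estimated_fcount.copy()
        for fid in unattested:
            estimated[fid] += 1                      # avoid log(0), as the trainer does
        with numpy.errstate(divide='ignore'):        # empirical counts may contain zeros
            weights = weights + (numpy.log(empirical_fcount) - numpy.log(estimated)) * Cinv
        for fid in unattested:
            weights[fid] = numpy.NINF                # never-seen features stay impossible
        return weights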
index 6a80b7d..f86d8aa 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Interface to Megam Classifier
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -22,8 +22,13 @@ for details.
 
 .. _megam: http://www.umiacs.umd.edu/~hal/megam/index.html
 """
+from __future__ import print_function
+
 import subprocess
 
+from six import string_types
+
+from nltk import compat
 from nltk.internals import find_binary
 
 try:
@@ -50,11 +55,11 @@ def config_megam(bin=None):
     """
     global _megam_bin
     _megam_bin = find_binary(
-        "megam",
+        'megam',
         bin,
-        env_vars=["MEGAM"],
-        binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"],
-        url="http://www.umiacs.umd.edu/~hal/megam/index.html",
+        env_vars=['MEGAM'],
+        binary_names=['megam.opt', 'megam', 'megam_686', 'megam_i686.opt'],
+        url='http://www.umiacs.umd.edu/~hal/megam/index.html',
     )
 
 
@@ -100,12 +105,12 @@ def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True
     # Write the file, which contains one line per instance.
     for featureset, label in train_toks:
         # First, the instance number (or, in the weighted multiclass case, the cost of each label).
-        if hasattr(encoding, "cost"):
+        if hasattr(encoding, 'cost'):
             stream.write(
-                ":".join(str(encoding.cost(featureset, label, l)) for l in labels)
+                ':'.join(str(encoding.cost(featureset, label, l)) for l in labels)
             )
         else:
-            stream.write("%d" % labelnum[label])
+            stream.write('%d' % labelnum[label])
 
         # For implicit file formats, just list the features that fire
         # for this instance's actual label.
@@ -116,11 +121,11 @@ def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True
         # any of the possible labels.
         else:
             for l in labels:
-                stream.write(" #")
+                stream.write(' #')
                 _write_megam_features(encoding.encode(featureset, l), stream, bernoulli)
 
         # End of the instance.
-        stream.write("\n")
+        stream.write('\n')
 
 
 def parse_megam_weights(s, features_count, explicit=True):
@@ -130,10 +135,10 @@ def parse_megam_weights(s, features_count, explicit=True):
     vector.  This function does not currently handle bias features.
     """
     if numpy is None:
-        raise ValueError("This function requires that numpy be installed")
-    assert explicit, "non-explicit not supported yet"
-    lines = s.strip().split("\n")
-    weights = numpy.zeros(features_count, "d")
+        raise ValueError('This function requires that numpy be installed')
+    assert explicit, 'non-explicit not supported yet'
+    lines = s.strip().split('\n')
+    weights = numpy.zeros(features_count, 'd')
     for line in lines:
         if line.strip():
             fid, weight = line.split()
@@ -144,26 +149,26 @@ def parse_megam_weights(s, features_count, explicit=True):
 def _write_megam_features(vector, stream, bernoulli):
     if not vector:
         raise ValueError(
-            "MEGAM classifier requires the use of an " "always-on feature."
+            'MEGAM classifier requires the use of an ' 'always-on feature.'
         )
     for (fid, fval) in vector:
         if bernoulli:
             if fval == 1:
-                stream.write(" %s" % fid)
+                stream.write(' %s' % fid)
             elif fval != 0:
                 raise ValueError(
-                    "If bernoulli=True, then all" "features must be binary."
+                    'If bernoulli=True, then all' 'features must be binary.'
                 )
         else:
-            stream.write(" %s %s" % (fid, fval))
+            stream.write(' %s %s' % (fid, fval))
 
 
 def call_megam(args):
     """
     Call the ``megam`` binary with the given arguments.
     """
-    if isinstance(args, str):
-        raise TypeError("args should be a list of strings")
+    if isinstance(args, string_types):
+        raise TypeError('args should be a list of strings')
     if _megam_bin is None:
         config_megam()
 
@@ -176,9 +181,9 @@ def call_megam(args):
     if p.returncode != 0:
         print()
         print(stderr)
-        raise OSError("megam command failed!")
+        raise OSError('megam command failed!')
 
-    if isinstance(stdout, str):
+    if isinstance(stdout, string_types):
         return stdout
     else:
-        return stdout.decode("utf-8")
+        return stdout.decode('utf-8')
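The `write_megam_file` and `_write_megam_features` hunks above only change quoting, so for orientation: each training instance becomes one line that starts with the label number (or, in the weighted multiclass case, a colon-separated cost per label) followed by the features that fire. A small sketch of the implicit bernoulli line format, with invented feature names; `megam_line` is illustrative and not part of the module.

    def megam_line(labelnum, firing_features, bernoulli=True):
        # Sketch of one line of a megam training file (implicit format).
        # firing_features is an iterable of (feature_id, feature_value) pairs.
        parts = ['%d' % labelnum]
        for fid, fval in firing_features:
            if bernoulli:
                if fval == 1:
                    parts.append('%s' % fid)          # binary feature: name only
                elif fval != 0:
                    raise ValueError('bernoulli format requires binary features')
            else:
                parts.append('%s %s' % (fid, fval))   # valued feature: name value
        return ' '.join(parts)

    # megam_line(1, [('alwayson', 1), ('startswith(a)', 1)]) -> '1 alwayson startswith(a)'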
index abfed1a..8859439 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Naive Bayes Classifiers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -29,6 +29,7 @@ sum to one:
 |  P(label|features) = --------------------------------------------
 |                        SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
 """
+from __future__ import print_function, unicode_literals
 
 from collections import defaultdict
 
@@ -98,7 +99,7 @@ class NaiveBayesClassifier(ClassifierI):
                 if (label, fname) in self._feature_probdist:
                     break
             else:
-                # print('Ignoring unseen feature %s' % fname)
+                # print 'Ignoring unseen feature %s' % fname
                 del featureset[fname]
 
         # Find the log probabilty of each label, given the features.
@@ -124,7 +125,7 @@ class NaiveBayesClassifier(ClassifierI):
     def show_most_informative_features(self, n=10):
         # Determine the most relevant features, and display them.
         cpdist = self._feature_probdist
-        print("Most Informative Features")
+        print('Most Informative Features')
 
         for (fname, fval) in self.most_informative_features(n):
 
@@ -133,22 +134,21 @@ class NaiveBayesClassifier(ClassifierI):
 
             labels = sorted(
                 [l for l in self._labels if fval in cpdist[l, fname].samples()],
-                key=lambda element: (-labelprob(element), element),
-                reverse=True
+                key=labelprob,
             )
             if len(labels) == 1:
                 continue
             l0 = labels[0]
             l1 = labels[-1]
             if cpdist[l0, fname].prob(fval) == 0:
-                ratio = "INF"
+                ratio = 'INF'
             else:
-                ratio = "%8.1f" % (
+                ratio = '%8.1f' % (
                     cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
                 )
             print(
                 (
-                    "%24s = %-14r %6s : %-6s = %s : 1.0"
+                    '%24s = %-14r %6s : %-6s = %s : 1.0'
                     % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
                 )
             )
@@ -163,7 +163,7 @@ class NaiveBayesClassifier(ClassifierI):
 
         |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
         """
-        if hasattr(self, "_most_informative_features"):
+        if hasattr(self, '_most_informative_features'):
             return self._most_informative_features[:n]
         else:
             # The set of (fname, fval) pairs used by this classifier.
@@ -186,8 +186,7 @@ class NaiveBayesClassifier(ClassifierI):
             # Convert features to a list, & sort it by how informative
             # features are.
             self._most_informative_features = sorted(
-                features, key=lambda feature_: (minprob[feature_] / maxprob[feature_], feature_[0],
-                                                feature_[1] in [None, False, True], str(feature_[1]).lower())
+                features, key=lambda feature_: minprob[feature_] / maxprob[feature_]
             )
         return self._most_informative_features[:n]
 
@@ -253,5 +252,5 @@ def demo():
     classifier.show_most_informative_features()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
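The probability formula quoted in this file's docstring is easiest to read as code. Here is a minimal, self-contained sketch of that scoring, assuming plain dictionaries for the trained tables and a tiny floor probability for unseen values; the real classifier works with ProbDist objects and whatever estimator it was trained with.

    import math

    def naive_bayes_scores(label_priors, feature_probs, featureset, floor=1e-9):
        # label_priors  : {label: P(label)}
        # feature_probs : {(label, fname): {fval: P(fname=fval | label)}}
        # featureset    : {fname: fval} for the instance being classified
        scores = {}
        for label, prior in label_priors.items():
            total = math.log(prior)
            for fname, fval in featureset.items():
                p = feature_probs.get((label, fname), {}).get(fval, floor)
                total += math.log(p)
            scores[label] = total
        return scores  # unnormalised log scores; exponentiate and normalise for P(label|features)

Separately, note that the `most_informative_features` hunk drops the multi-part tie-breaking sort key and keeps only the probability ratio, so features with identical ratios may come back in an arbitrary order after this conversion.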
index 58621f1..7d6cb15 100644 (file)
@@ -59,8 +59,8 @@ The features of a sentence are simply the words it contains:
 
 We use the sports sentences as positive examples, the mixed ones as unlabeled examples:
 
-    >>> positive_featuresets = map(features, sports_sentences)
-    >>> unlabeled_featuresets = map(features, various_sentences)
+    >>> positive_featuresets = list(map(features, sports_sentences))
+    >>> unlabeled_featuresets = list(map(features, various_sentences))
     >>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
     ...                                                 unlabeled_featuresets)
 
@@ -95,10 +95,10 @@ class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
         estimator=ELEProbDist,
     ):
         """
-        :param positive_featuresets: An iterable of featuresets that are known as positive
+        :param positive_featuresets: A list of featuresets that are known as positive
             examples (i.e., their label is ``True``).
 
-        :param unlabeled_featuresets: An iterable of featuresets whose label is unknown.
+        :param unlabeled_featuresets: A list of featuresets whose label is unknown.
 
         :param positive_prob_prior: A prior estimate of the probability of the label
             ``True`` (default 0.5).
@@ -109,30 +109,28 @@ class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
         fnames = set()
 
         # Count up how many times each feature value occurred in positive examples.
-        num_positive_examples = 0
         for featureset in positive_featuresets:
             for fname, fval in featureset.items():
                 positive_feature_freqdist[fname][fval] += 1
                 feature_values[fname].add(fval)
                 fnames.add(fname)
-            num_positive_examples += 1
 
         # Count up how many times each feature value occurred in unlabeled examples.
-        num_unlabeled_examples = 0
         for featureset in unlabeled_featuresets:
             for fname, fval in featureset.items():
                 unlabeled_feature_freqdist[fname][fval] += 1
                 feature_values[fname].add(fval)
                 fnames.add(fname)
-            num_unlabeled_examples += 1
 
         # If a feature didn't have a value given for an instance, then we assume that
         # it gets the implicit value 'None'.
+        num_positive_examples = len(positive_featuresets)
         for fname in fnames:
             count = positive_feature_freqdist[fname].N()
             positive_feature_freqdist[fname][None] += num_positive_examples - count
             feature_values[fname].add(None)
 
+        num_unlabeled_examples = len(unlabeled_featuresets)
         for fname in fnames:
             count = unlabeled_feature_freqdist[fname].N()
             unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count
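The changes in this file all follow from one constraint: after the conversion, the constructor measures `positive_featuresets` and `unlabeled_featuresets` with `len()` instead of counting while iterating, so both inputs must be real lists, hence the `list(map(...))` wrapping in the doctest and the `:param:` wording change. A short illustration of why a bare `map` object would no longer do (the toy data is invented):

    featuresets = map(lambda s: {'word': s}, ['a', 'b', 'c'])   # lazy iterator in Python 3
    # len(featuresets) would raise TypeError: object of type 'map' has no len()
    featuresets = list(featuresets)                              # materialise it first
    assert len(featuresets) == 3                                 # now len() and re-iteration work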
index 0be8c81..19e1332 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: RTE Classifier
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -16,6 +16,7 @@ the hypothesis is more informative than (i.e not entailed by) the text.
 TO DO: better Named Entity classification
 TO DO: add lemmatization
 """
+from __future__ import print_function
 
 from nltk.tokenize import RegexpTokenizer
 from nltk.classify.util import accuracy, check_megam_config
@@ -37,28 +38,28 @@ class RTEFeatureExtractor(object):
         self.stop = stop
         self.stopwords = set(
             [
-                "a",
-                "the",
-                "it",
-                "they",
-                "of",
-                "in",
-                "to",
-                "is",
-                "have",
-                "are",
-                "were",
-                "and",
-                "very",
-                ".",
-                ",",
+                'a',
+                'the',
+                'it',
+                'they',
+                'of',
+                'in',
+                'to',
+                'is',
+                'have',
+                'are',
+                'were',
+                'and',
+                'very',
+                '.',
+                ',',
             ]
         )
 
-        self.negwords = set(["no", "not", "never", "failed", "rejected", "denied"])
+        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])
         # Try to tokenize so that abbreviations, monetary amounts, email
         # addresses, URLs are single tokens.
-        tokenizer = RegexpTokenizer("[\w.@:/]+|\w+|\$[\d.]+")
+        tokenizer = RegexpTokenizer('[\w.@:/]+|\w+|\$[\d.]+')
 
         # Get the set of word types for text and hypothesis
         self.text_tokens = tokenizer.tokenize(rtepair.text)
@@ -86,11 +87,11 @@ class RTEFeatureExtractor(object):
         :type toktype: 'ne' or 'word'
         """
         ne_overlap = set(token for token in self._overlap if self._ne(token))
-        if toktype == "ne":
+        if toktype == 'ne':
             if debug:
                 print("ne overlap", ne_overlap)
             return ne_overlap
-        elif toktype == "word":
+        elif toktype == 'word':
             if debug:
                 print("word overlap", self._overlap - ne_overlap)
             return self._overlap - ne_overlap
@@ -105,9 +106,9 @@ class RTEFeatureExtractor(object):
         :type toktype: 'ne' or 'word'
         """
         ne_extra = set(token for token in self._hyp_extra if self._ne(token))
-        if toktype == "ne":
+        if toktype == 'ne':
             return ne_extra
-        elif toktype == "word":
+        elif toktype == 'word':
             return self._hyp_extra - ne_extra
         else:
             raise ValueError("Type not recognized: '%s'" % toktype)
@@ -138,13 +139,13 @@ class RTEFeatureExtractor(object):
 def rte_features(rtepair):
     extractor = RTEFeatureExtractor(rtepair)
     features = {}
-    features["alwayson"] = True
-    features["word_overlap"] = len(extractor.overlap("word"))
-    features["word_hyp_extra"] = len(extractor.hyp_extra("word"))
-    features["ne_overlap"] = len(extractor.overlap("ne"))
-    features["ne_hyp_extra"] = len(extractor.hyp_extra("ne"))
-    features["neg_txt"] = len(extractor.negwords & extractor.text_words)
-    features["neg_hyp"] = len(extractor.negwords & extractor.hyp_words)
+    features['alwayson'] = True
+    features['word_overlap'] = len(extractor.overlap('word'))
+    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
+    features['ne_overlap'] = len(extractor.overlap('ne'))
+    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
+    features['neg_txt'] = len(extractor.negwords & extractor.text_words)
+    features['neg_hyp'] = len(extractor.negwords & extractor.hyp_words)
     return features
 
 
@@ -155,17 +156,17 @@ def rte_featurize(rte_pairs):
 def rte_classifier(algorithm):
     from nltk.corpus import rte as rte_corpus
 
-    train_set = rte_corpus.pairs(["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"])
-    test_set = rte_corpus.pairs(["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"])
+    train_set = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
+    test_set = rte_corpus.pairs(['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
     featurized_train_set = rte_featurize(train_set)
     featurized_test_set = rte_featurize(test_set)
     # Train the classifier
-    print("Training classifier...")
-    if algorithm in ["megam", "BFGS"]:  # MEGAM based algorithms.
+    print('Training classifier...')
+    if algorithm in ['megam', 'BFGS']:  # MEGAM based algorithms.
         # Ensure that MEGAM is configured first.
         check_megam_config()
         clf = lambda x: MaxentClassifier.train(featurized_train_set, algorithm)
-    elif algorithm in ["GIS", "IIS"]:  # Use default GIS/IIS MaxEnt algorithm
+    elif algorithm in ['GIS', 'IIS']:  # Use default GIS/IIS MaxEnt algorithm
         clf = MaxentClassifier.train(featurized_train_set, algorithm)
     else:
         err_msg = str(
@@ -173,7 +174,7 @@ def rte_classifier(algorithm):
             "'megam', 'BFGS', 'GIS', 'IIS'.\n"
         )
         raise Exception(err_msg)
-    print("Testing classifier...")
+    print('Testing classifier...')
     acc = accuracy(clf, featurized_test_set)
-    print("Accuracy: %6.4f" % acc)
+    print('Accuracy: %6.4f' % acc)
     return clf
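For orientation, the feature dictionary built by `rte_features` in the hunk above always has the same seven keys; an illustrative, entirely made-up example for one text/hypothesis pair might look like this.

    # Keys are exactly the ones set in rte_features above; the counts are invented.
    example_rte_features = {
        'alwayson': True,       # bias feature, always fires
        'word_overlap': 3,      # non-stopword tokens shared by text and hypothesis
        'word_hyp_extra': 1,    # non-stopword tokens found only in the hypothesis
        'ne_overlap': 1,        # named entities shared by both
        'ne_hyp_extra': 0,      # named entities found only in the hypothesis
        'neg_txt': 0,           # negation words ('no', 'not', ...) in the text
        'neg_hyp': 0,           # negation words in the hypothesis
    }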
index 90b450b..c00dcdc 100644 (file)
@@ -30,9 +30,13 @@ best 1000 features:
 ...                      ('nb', MultinomialNB())])
 >>> classif = SklearnClassifier(pipeline)
 """
+from __future__ import print_function, unicode_literals
+
+from six.moves import zip
 
 from nltk.classify.api import ClassifierI
 from nltk.probability import DictionaryProbDist
+from nltk import compat
 
 try:
     from sklearn.feature_extraction import DictVectorizer
@@ -40,9 +44,10 @@ try:
 except ImportError:
     pass
 
-__all__ = ["SklearnClassifier"]
+__all__ = ['SklearnClassifier']
 
 
+@compat.python_2_unicode_compatible
 class SklearnClassifier(ClassifierI):
     """Wrapper for scikit-learn classifiers."""
 
index 35bd402..0ccd29f 100644 (file)
@@ -1,7 +1,7 @@
 # encoding: utf-8
 # Natural Language Toolkit: Senna Interface
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -29,6 +29,7 @@ The input is:
 
 Note: Unit tests for this module can be found in test/unit/test_senna.py
 
+    >>> from __future__ import unicode_literals
     >>> from nltk.classify import Senna
     >>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
     >>> sent = 'Dusseldorf is an international business center'.split()
@@ -37,20 +38,26 @@ Note: Unit tests for this module can be found in test/unit/test_senna.py
     ('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
 """
 
+
+from __future__ import unicode_literals
 from os import path, sep, environ
 from subprocess import Popen, PIPE
 from platform import architecture, system
 
+from six import text_type
+
 from nltk.tag.api import TaggerI
+from nltk.compat import python_2_unicode_compatible
 
-_senna_url = "http://ml.nec-labs.com/senna/"
+_senna_url = 'http://ml.nec-labs.com/senna/'
 
 
+@python_2_unicode_compatible
 class Senna(TaggerI):
 
-    SUPPORTED_OPERATIONS = ["pos", "chk", "ner"]
+    SUPPORTED_OPERATIONS = ['pos', 'chk', 'ner']
 
-    def __init__(self, senna_path, operations, encoding="utf-8"):
+    def __init__(self, senna_path, operations, encoding='utf-8'):
         self._encoding = encoding
         self._path = path.normpath(senna_path) + sep
 
@@ -59,9 +66,9 @@ class Senna(TaggerI):
         exe_file_1 = self.executable(self._path)
         if not path.isfile(exe_file_1):
             # Check for the system environment
-            if "SENNA" in environ:
+            if 'SENNA' in environ:
                 # self._path = path.join(environ['SENNA'],'')
-                self._path = path.normpath(environ["SENNA"]) + sep
+                self._path = path.normpath(environ['SENNA']) + sep
                 exe_file_2 = self.executable(self._path)
                 if not path.isfile(exe_file_2):
                     raise OSError(
@@ -78,16 +85,16 @@ class Senna(TaggerI):
         be used.
         """
         os_name = system()
-        if os_name == "Linux":
+        if os_name == 'Linux':
             bits = architecture()[0]
-            if bits == "64bit":
-                return path.join(base_path, "senna-linux64")
-            return path.join(base_path, "senna-linux32")
-        if os_name == "Windows":
-            return path.join(base_path, "senna-win32.exe")
-        if os_name == "Darwin":
-            return path.join(base_path, "senna-osx")
-        return path.join(base_path, "senna")
+            if bits == '64bit':
+                return path.join(base_path, 'senna-linux64')
+            return path.join(base_path, 'senna-linux32')
+        if os_name == 'Windows':
+            return path.join(base_path, 'senna-win32.exe')
+        if os_name == 'Darwin':
+            return path.join(base_path, 'senna-osx')
+        return path.join(base_path, 'senna')
 
     def _map(self):
         """
@@ -125,16 +132,16 @@ class Senna(TaggerI):
         # Build the senna command to run the tagger
         _senna_cmd = [
             self.executable(self._path),
-            "-path",
+            '-path',
             self._path,
-            "-usrtokens",
-            "-iobtags",
+            '-usrtokens',
+            '-iobtags',
         ]
-        _senna_cmd.extend(["-" + op for op in self.operations])
+        _senna_cmd.extend(['-' + op for op in self.operations])
 
         # Serialize the actual sentences to a temporary string
-        _input = "\n".join((" ".join(x) for x in sentences)) + "\n"
-        if isinstance(_input, str) and encoding:
+        _input = '\n'.join((' '.join(x) for x in sentences)) + '\n'
+        if isinstance(_input, text_type) and encoding:
             _input = _input.encode(encoding)
 
         # Run the tagger and get the output
@@ -144,7 +151,7 @@ class Senna(TaggerI):
 
         # Check the return code.
         if p.returncode != 0:
-            raise RuntimeError("Senna command failed! Details: %s" % stderr)
+            raise RuntimeError('Senna command failed! Details: %s' % stderr)
 
         if encoding:
             senna_output = stdout.decode(encoding)
@@ -160,12 +167,12 @@ class Senna(TaggerI):
                 sentence_index += 1
                 token_index = 0
                 continue
-            tags = tagged_word.split("\t")
+            tags = tagged_word.split('\t')
             result = {}
             for tag in map_:
                 result[tag] = tags[map_[tag]].strip()
             try:
-                result["word"] = sentences[sentence_index][token_index]
+                result['word'] = sentences[sentence_index][token_index]
             except IndexError:
                 raise IndexError(
                     "Misalignment error occurred at sentence number %d. Possible reason"
@@ -183,6 +190,6 @@ def setup_module(module):
     from nose import SkipTest
 
     try:
-        tagger = Senna("/usr/share/senna-v3.0", ["pos", "chk", "ner"])
+        tagger = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
     except OSError:
         raise SkipTest("Senna executable not found")
index 544f859..b6e0b3a 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: SVM-based classifier
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Leon Derczynski <leon@dcs.shef.ac.uk>
 #
 # URL: <http://nltk.org/>
index 8780699..a2f8daf 100644 (file)
@@ -1,13 +1,16 @@
 # Natural Language Toolkit: Interface to TADM Classifier
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Joseph Frazee <jfrazee@mail.utexas.edu>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
 
 import sys
 import subprocess
 
+from six import string_types
+
 from nltk.internals import find_binary
 
 try:
@@ -21,7 +24,7 @@ _tadm_bin = None
 def config_tadm(bin=None):
     global _tadm_bin
     _tadm_bin = find_binary(
-        "tadm", bin, env_vars=["TADM"], binary_names=["tadm"], url="http://tadm.sf.net"
+        'tadm', bin, env_vars=['TADM'], binary_names=['tadm'], url='http://tadm.sf.net'
     )
 
 
@@ -47,14 +50,14 @@ def write_tadm_file(train_toks, encoding, stream):
     # http://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054
     labels = encoding.labels()
     for featureset, label in train_toks:
-        length_line = "%d\n" % len(labels)
+        length_line = '%d\n' % len(labels)
         stream.write(length_line)
         for known_label in labels:
             v = encoding.encode(featureset, known_label)
-            line = "%d %d %s\n" % (
+            line = '%d %d %s\n' % (
                 int(label == known_label),
                 len(v),
-                " ".join("%d %d" % u for u in v),
+                ' '.join('%d %d' % u for u in v),
             )
             stream.write(line)
 
@@ -68,15 +71,15 @@ def parse_tadm_weights(paramfile):
     weights = []
     for line in paramfile:
         weights.append(float(line.strip()))
-    return numpy.array(weights, "d")
+    return numpy.array(weights, 'd')
 
 
 def call_tadm(args):
     """
     Call the ``tadm`` binary with the given arguments.
     """
-    if isinstance(args, str):
-        raise TypeError("args should be a list of strings")
+    if isinstance(args, string_types):
+        raise TypeError('args should be a list of strings')
     if _tadm_bin is None:
         config_tadm()
 
@@ -89,7 +92,7 @@ def call_tadm(args):
     if p.returncode != 0:
         print()
         print(stderr)
-        raise OSError("tadm command failed!")
+        raise OSError('tadm command failed!')
 
 
 def names_demo():
@@ -104,18 +107,18 @@ def encoding_demo():
     from nltk.classify.maxent import TadmEventMaxentFeatureEncoding
 
     tokens = [
-        ({"f0": 1, "f1": 1, "f3": 1}, "A"),
-        ({"f0": 1, "f2": 1, "f4": 1}, "B"),
-        ({"f0": 2, "f2": 1, "f3": 1, "f4": 1}, "A"),
+        ({'f0': 1, 'f1': 1, 'f3': 1}, 'A'),
+        ({'f0': 1, 'f2': 1, 'f4': 1}, 'B'),
+        ({'f0': 2, 'f2': 1, 'f3': 1, 'f4': 1}, 'A'),
     ]
     encoding = TadmEventMaxentFeatureEncoding.train(tokens)
     write_tadm_file(tokens, encoding, sys.stdout)
     print()
     for i in range(encoding.length()):
-        print("%s --> %d" % (encoding.describe(i), i))
+        print('%s --> %d' % (encoding.describe(i), i))
     print()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     encoding_demo()
     names_demo()
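`write_tadm_file` above emits one event block per training instance: a line giving the number of candidate labels, then one line per label of the form `<is-gold> <n-features> <fid fval ...>`. A compact sketch that produces the same shape; `tadm_event_block` is illustrative, and the `encode` callable stands in for `encoding.encode(featureset, label)`.

    def tadm_event_block(gold_label, labels, encode):
        # One event block in the TADM input format written above.
        lines = ['%d' % len(labels)]                 # how many candidate labels follow
        for known_label in labels:
            v = encode(known_label)                  # [(feature_id, feature_value), ...]
            lines.append('%d %d %s' % (
                int(known_label == gold_label),      # 1 for the gold label, 0 otherwise
                len(v),
                ' '.join('%d %d' % u for u in v),
            ))
        return '\n'.join(lines) + '\n'

    # tadm_event_block('A', ['A', 'B'],
    #                  lambda l: [(0, 1), (3, 1)] if l == 'A' else [(0, 1)])
    # -> '2\n1 2 0 1 3 1\n0 1 0 1\n'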
index 97545d5..b217fa8 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Language ID module using TextCat algorithm
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Avital Pekker <avital.pekker@utoronto.ca>
 #
 # URL: <http://nltk.org/>
@@ -18,7 +18,7 @@ n-gram frequencies to profile languages and text-yet to
 be identified-then compares using a distance measure.
 
 Language n-grams are provided by the "An Crubadan"
-project. A corpus reader was created separately to read
+project. A corpus reader was created seperately to read
 those files.
 
 For details regarding the algorithm, see:
@@ -28,10 +28,17 @@ For details about An Crubadan, see:
 http://borel.slu.edu/crubadan/index.html
 """
 
-from sys import maxsize
+# Ensure that literal strings default to unicode rather than str.
+from __future__ import print_function, unicode_literals
 
+from nltk.compat import PY3
 from nltk.util import trigrams
 
+if PY3:
+    from sys import maxsize
+else:
+    from sys import maxint
+
 # Note: this is NOT "re" you're likely used to. The regex module
 # is an alternative to the standard re module that supports
 # Unicode codepoint properties with the \p{} syntax.
@@ -71,11 +78,11 @@ class TextCat(object):
             self._corpus.lang_freq(lang)
 
     def remove_punctuation(self, text):
-        """ Get rid of punctuation except apostrophes """
+        ''' Get rid of punctuation except apostrophes '''
         return re.sub(r"[^\P{P}\']+", "", text)
 
     def profile(self, text):
-        """ Create FreqDist of trigrams within text """
+        ''' Create FreqDist of trigrams within text '''
         from nltk import word_tokenize, FreqDist
 
         clean_text = self.remove_punctuation(text)
@@ -84,7 +91,7 @@ class TextCat(object):
         fingerprint = FreqDist()
         for t in tokens:
             token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
-            token_trigrams = ["".join(tri) for tri in token_trigram_tuples]
+            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
 
             for cur_trigram in token_trigrams:
                 if cur_trigram in fingerprint:
@@ -95,8 +102,8 @@ class TextCat(object):
         return fingerprint
 
     def calc_dist(self, lang, trigram, text_profile):
-        """ Calculate the "out-of-place" measure between the
-            text and language profile for a single trigram """
+        ''' Calculate the "out-of-place" measure between the
+            text and language profile for a single trigram '''
 
         lang_fd = self._corpus.lang_freq(lang)
         dist = 0
@@ -111,13 +118,16 @@ class TextCat(object):
             # Arbitrary but should be larger than
             # any possible trigram file length
             # in terms of total lines
-            dist = maxsize
+            if PY3:
+                dist = maxsize
+            else:
+                dist = maxint
 
         return dist
 
     def lang_dists(self, text):
-        """ Calculate the "out-of-place" measure between
-            the text and all languages """
+        ''' Calculate the "out-of-place" measure between
+            the text and all languages '''
 
         distances = {}
         profile = self.profile(text)
@@ -134,8 +144,8 @@ class TextCat(object):
         return distances
 
     def guess_language(self, text):
-        """ Find the language with the min distance
-            to the text and return its ISO 639-3 code """
+        ''' Find the language with the min distance
+            to the text and return its ISO 639-3 code '''
         self.last_distances = self.lang_dists(text)
 
         return min(self.last_distances, key=self.last_distances.get)
@@ -146,27 +156,27 @@ def demo():
     from nltk.corpus import udhr
 
     langs = [
-        "Kurdish-UTF8",
-        "Abkhaz-UTF8",
-        "Farsi_Persian-UTF8",
-        "Hindi-UTF8",
-        "Hawaiian-UTF8",
-        "Russian-UTF8",
-        "Vietnamese-UTF8",
-        "Serbian_Srpski-UTF8",
-        "Esperanto-UTF8",
+        'Kurdish-UTF8',
+        'Abkhaz-UTF8',
+        'Farsi_Persian-UTF8',
+        'Hindi-UTF8',
+        'Hawaiian-UTF8',
+        'Russian-UTF8',
+        'Vietnamese-UTF8',
+        'Serbian_Srpski-UTF8',
+        'Esperanto-UTF8',
     ]
 
     friendly = {
-        "kmr": "Northern Kurdish",
-        "abk": "Abkhazian",
-        "pes": "Iranian Persian",
-        "hin": "Hindi",
-        "haw": "Hawaiian",
-        "rus": "Russian",
-        "vie": "Vietnamese",
-        "srp": "Serbian",
-        "epo": "Esperanto",
+        'kmr': 'Northern Kurdish',
+        'abk': 'Abkhazian',
+        'pes': 'Iranian Persian',
+        'hin': 'Hindi',
+        'haw': 'Hawaiian',
+        'rus': 'Russian',
+        'vie': 'Vietnamese',
+        'srp': 'Serbian',
+        'epo': 'Esperanto',
     }
 
     tc = TextCat()
@@ -177,22 +187,22 @@ def demo():
         rows = len(raw_sentences) - 1
         cols = list(map(len, raw_sentences))
 
-        sample = ""
+        sample = ''
 
         # Generate a sample text of the language
         for i in range(0, rows):
-            cur_sent = ""
+            cur_sent = ''
             for j in range(0, cols[i]):
-                cur_sent += " " + raw_sentences[i][j]
+                cur_sent += ' ' + raw_sentences[i][j]
 
             sample += cur_sent
 
         # Try to detect what it is
-        print("Language snippet: " + sample[0:140] + "...")
+        print('Language snippet: ' + sample[0:140] + '...')
         guess = tc.guess_language(sample)
-        print("Language detection: %s (%s)" % (guess, friendly[guess]))
-        print("#" * 140)
+        print('Language detection: %s (%s)' % (guess, friendly[guess]))
+        print('#' * 140)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
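The `calc_dist` and `lang_dists` hunks above only touch string style, so as a reminder of what the "out-of-place" measure computes: for each trigram in the text profile, take the difference between its rank there and its rank in the language profile, and charge a large penalty (the `maxsize`/`maxint` value in the diff) when the trigram is missing. A compact sketch, assuming plain rank dictionaries rather than NLTK FreqDists; `out_of_place` is illustrative only.

    def out_of_place(text_ranks, lang_ranks, missing_penalty=10 ** 6):
        # text_ranks / lang_ranks map a trigram to its frequency rank (0 = most common).
        # missing_penalty plays the role of sys.maxsize in the module above.
        dist = 0
        for trigram, text_rank in text_ranks.items():
            if trigram in lang_ranks:
                dist += abs(lang_ranks[trigram] - text_rank)
            else:
                dist += missing_penalty
        return dist

The language guessed for a text is then simply the one with the smallest summed distance, exactly as `guess_language` does with `min(self.last_distances, key=self.last_distances.get)`.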
index b9d1986..a0a15a6 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Classifier Utility Functions
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com> (minor additions)
 # URL: <http://nltk.org/>
@@ -9,6 +9,7 @@
 """
 Utility functions and classes for classifiers.
 """
+from __future__ import print_function, division
 
 import math
 
@@ -107,10 +108,10 @@ class CutoffChecker(object):
 
     def __init__(self, cutoffs):
         self.cutoffs = cutoffs.copy()
-        if "min_ll" in cutoffs:
-            cutoffs["min_ll"] = -abs(cutoffs["min_ll"])
-        if "min_lldelta" in cutoffs:
-            cutoffs["min_lldelta"] = abs(cutoffs["min_lldelta"])
+        if 'min_ll' in cutoffs:
+            cutoffs['min_ll'] = -abs(cutoffs['min_ll'])
+        if 'min_lldelta' in cutoffs:
+            cutoffs['min_lldelta'] = abs(cutoffs['min_lldelta'])
         self.ll = None
         self.acc = None
         self.iter = 1
@@ -118,32 +119,32 @@ class CutoffChecker(object):
     def check(self, classifier, train_toks):
         cutoffs = self.cutoffs
         self.iter += 1
-        if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
+        if 'max_iter' in cutoffs and self.iter >= cutoffs['max_iter']:
             return True  # iteration cutoff.
 
         new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
         if math.isnan(new_ll):
             return True
 
-        if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
-            if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
+        if 'min_ll' in cutoffs or 'min_lldelta' in cutoffs:
+            if 'min_ll' in cutoffs and new_ll >= cutoffs['min_ll']:
                 return True  # log likelihood cutoff
             if (
-                "min_lldelta" in cutoffs
+                'min_lldelta' in cutoffs
                 and self.ll
-                and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
+                and ((new_ll - self.ll) <= abs(cutoffs['min_lldelta']))
             ):
                 return True  # log likelihood delta cutoff
             self.ll = new_ll
 
-        if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
+        if 'max_acc' in cutoffs or 'min_accdelta' in cutoffs:
             new_acc = nltk.classify.util.log_likelihood(classifier, train_toks)
-            if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
+            if 'max_acc' in cutoffs and new_acc >= cutoffs['max_acc']:
                 return True  # log likelihood cutoff
             if (
-                "min_accdelta" in cutoffs
+                'min_accdelta' in cutoffs
                 and self.acc
-                and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
+                and ((new_acc - self.acc) <= abs(cutoffs['min_accdelta']))
             ):
                 return True  # log likelihood delta cutoff
             self.acc = new_acc
@@ -158,25 +159,25 @@ class CutoffChecker(object):
 
 def names_demo_features(name):
     features = {}
-    features["alwayson"] = True
-    features["startswith"] = name[0].lower()
-    features["endswith"] = name[-1].lower()
-    for letter in "abcdefghijklmnopqrstuvwxyz":
-        features["count(%s)" % letter] = name.lower().count(letter)
-        features["has(%s)" % letter] = letter in name.lower()
+    features['alwayson'] = True
+    features['startswith'] = name[0].lower()
+    features['endswith'] = name[-1].lower()
+    for letter in 'abcdefghijklmnopqrstuvwxyz':
+        features['count(%s)' % letter] = name.lower().count(letter)
+        features['has(%s)' % letter] = letter in name.lower()
     return features
 
 
 def binary_names_demo_features(name):
     features = {}
-    features["alwayson"] = True
-    features["startswith(vowel)"] = name[0].lower() in "aeiouy"
-    features["endswith(vowel)"] = name[-1].lower() in "aeiouy"
-    for letter in "abcdefghijklmnopqrstuvwxyz":
-        features["count(%s)" % letter] = name.lower().count(letter)
-        features["has(%s)" % letter] = letter in name.lower()
-        features["startswith(%s)" % letter] = letter == name[0].lower()
-        features["endswith(%s)" % letter] = letter == name[-1].lower()
+    features['alwayson'] = True
+    features['startswith(vowel)'] = name[0].lower() in 'aeiouy'
+    features['endswith(vowel)'] = name[-1].lower() in 'aeiouy'
+    for letter in 'abcdefghijklmnopqrstuvwxyz':
+        features['count(%s)' % letter] = name.lower().count(letter)
+        features['has(%s)' % letter] = letter in name.lower()
+        features['startswith(%s)' % letter] = letter == name[0].lower()
+        features['endswith(%s)' % letter] = letter == name[-1].lower()
     return features
 
 
@@ -185,8 +186,8 @@ def names_demo(trainer, features=names_demo_features):
     import random
 
     # Construct a list of classified names, using the names corpus.
-    namelist = [(name, "male") for name in names.words("male.txt")] + [
-        (name, "female") for name in names.words("female.txt")
+    namelist = [(name, 'male') for name in names.words('male.txt')] + [
+        (name, 'female') for name in names.words('female.txt')
     ]
 
     # Randomly split the names into a test & train set.
@@ -196,13 +197,13 @@ def names_demo(trainer, features=names_demo_features):
     test = namelist[5000:5500]
 
     # Train up a classifier.
-    print("Training classifier...")
+    print('Training classifier...')
     classifier = trainer([(features(n), g) for (n, g) in train])
 
     # Run the classifier on the test data.
-    print("Testing classifier...")
+    print('Testing classifier...')
     acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
-    print("Accuracy: %6.4f" % acc)
+    print('Accuracy: %6.4f' % acc)
 
     # For classifiers that can find probabilities, show the log
     # likelihood and some sample probability distributions.
@@ -210,15 +211,15 @@ def names_demo(trainer, features=names_demo_features):
         test_featuresets = [features(n) for (n, g) in test]
         pdists = classifier.prob_classify_many(test_featuresets)
         ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
-        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
+        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
         print()
-        print("Unseen Names      P(Male)  P(Female)\n" + "-" * 40)
+        print('Unseen Names      P(Male)  P(Female)\n' + '-' * 40)
         for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
-            if gender == "male":
-                fmt = "  %-15s *%6.4f   %6.4f"
+            if gender == 'male':
+                fmt = '  %-15s *%6.4f   %6.4f'
             else:
-                fmt = "  %-15s  %6.4f  *%6.4f"
-            print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
+                fmt = '  %-15s  %6.4f  *%6.4f'
+            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
     except NotImplementedError:
         pass
 
@@ -230,8 +231,8 @@ def partial_names_demo(trainer, features=names_demo_features):
     from nltk.corpus import names
     import random
 
-    male_names = names.words("male.txt")
-    female_names = names.words("female.txt")
+    male_names = names.words('male.txt')
+    female_names = names.words('female.txt')
 
     random.seed(654321)
     random.shuffle(male_names)
@@ -251,13 +252,13 @@ def partial_names_demo(trainer, features=names_demo_features):
     random.shuffle(test)
 
     # Train up a classifier.
-    print("Training classifier...")
+    print('Training classifier...')
     classifier = trainer(positive, unlabeled)
 
     # Run the classifier on the test data.
-    print("Testing classifier...")
+    print('Testing classifier...')
     acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
-    print("Accuracy: %6.4f" % acc)
+    print('Accuracy: %6.4f' % acc)
 
     # For classifiers that can find probabilities, show the log
     # likelihood and some sample probability distributions.
@@ -265,14 +266,14 @@ def partial_names_demo(trainer, features=names_demo_features):
         test_featuresets = [features(n) for (n, m) in test]
         pdists = classifier.prob_classify_many(test_featuresets)
         ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
-        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
+        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
         print()
-        print("Unseen Names      P(Male)  P(Female)\n" + "-" * 40)
+        print('Unseen Names      P(Male)  P(Female)\n' + '-' * 40)
         for ((name, is_male), pdist) in zip(test, pdists)[:5]:
             if is_male == True:
-                fmt = "  %-15s *%6.4f   %6.4f"
+                fmt = '  %-15s *%6.4f   %6.4f'
             else:
-                fmt = "  %-15s  %6.4f  *%6.4f"
+                fmt = '  %-15s  %6.4f  *%6.4f'
             print(fmt % (name, pdist.prob(True), pdist.prob(False)))
     except NotImplementedError:
         pass
@@ -289,7 +290,7 @@ def wsd_demo(trainer, word, features, n=1000):
     import random
 
     # Get the instances.
-    print("Reading data...")
+    print('Reading data...')
     global _inst_cache
     if word not in _inst_cache:
         _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
@@ -297,23 +298,23 @@ def wsd_demo(trainer, word, features, n=1000):
     if n > len(instances):
         n = len(instances)
     senses = list(set(l for (i, l) in instances))
-    print("  Senses: " + " ".join(senses))
+    print('  Senses: ' + ' '.join(senses))
 
     # Randomly split the names into a test & train set.
-    print("Splitting into test & train...")
+    print('Splitting into test & train...')
     random.seed(123456)
     random.shuffle(instances)
     train = instances[: int(0.8 * n)]
     test = instances[int(0.8 * n) : n]
 
     # Train up a classifier.
-    print("Training classifier...")
+    print('Training classifier...')
     classifier = trainer([(features(i), l) for (i, l) in train])
 
     # Run the classifier on the test data.
-    print("Testing classifier...")
+    print('Testing classifier...')
     acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
-    print("Accuracy: %6.4f" % acc)
+    print('Accuracy: %6.4f' % acc)
 
     # For classifiers that can find probabilities, show the log
     # likelihood and some sample probability distributions.
@@ -321,7 +322,7 @@ def wsd_demo(trainer, word, features, n=1000):
         test_featuresets = [features(i) for (i, n) in test]
         pdists = classifier.prob_classify_many(test_featuresets)
         ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
-        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
+        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
     except NotImplementedError:
         pass
 
index 3bfb311..fbd4302 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Interface to Weka Classsifiers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -8,7 +8,7 @@
 """
 Classifiers that make use of the external 'Weka' package.
 """
-
+from __future__ import print_function
 import time
 import tempfile
 import os
@@ -17,6 +17,8 @@ import re
 import zipfile
 from sys import stdin
 
+from six import integer_types, string_types
+
 from nltk.probability import DictionaryProbDist
 from nltk.internals import java, config_java
 
@@ -24,11 +26,11 @@ from nltk.classify.api import ClassifierI
 
 _weka_classpath = None
 _weka_search = [
-    ".",
-    "/usr/share/weka",
-    "/usr/local/share/weka",
-    "/usr/lib/weka",
-    "/usr/local/lib/weka",
+    '.',
+    '/usr/share/weka',
+    '/usr/local/share/weka',
+    '/usr/lib/weka',
+    '/usr/local/lib/weka',
 ]
 
 
@@ -43,27 +45,27 @@ def config_weka(classpath=None):
 
     if _weka_classpath is None:
         searchpath = _weka_search
-        if "WEKAHOME" in os.environ:
-            searchpath.insert(0, os.environ["WEKAHOME"])
+        if 'WEKAHOME' in os.environ:
+            searchpath.insert(0, os.environ['WEKAHOME'])
 
         for path in searchpath:
-            if os.path.exists(os.path.join(path, "weka.jar")):
-                _weka_classpath = os.path.join(path, "weka.jar")
+            if os.path.exists(os.path.join(path, 'weka.jar')):
+                _weka_classpath = os.path.join(path, 'weka.jar')
                 version = _check_weka_version(_weka_classpath)
                 if version:
                     print(
-                        ("[Found Weka: %s (version %s)]" % (_weka_classpath, version))
+                        ('[Found Weka: %s (version %s)]' % (_weka_classpath, version))
                     )
                 else:
-                    print("[Found Weka: %s]" % _weka_classpath)
+                    print('[Found Weka: %s]' % _weka_classpath)
                 _check_weka_version(_weka_classpath)
 
     if _weka_classpath is None:
         raise LookupError(
-            "Unable to find weka.jar!  Use config_weka() "
-            "or set the WEKAHOME environment variable. "
-            "For more information about Weka, please see "
-            "http://www.cs.waikato.ac.nz/ml/weka/"
+            'Unable to find weka.jar!  Use config_weka() '
+            'or set the WEKAHOME environment variable. '
+            'For more information about Weka, please see '
+            'http://www.cs.waikato.ac.nz/ml/weka/'
         )
 
 
@@ -76,7 +78,7 @@ def _check_weka_version(jar):
         return None
     try:
         try:
-            return zf.read("weka/core/version.txt")
+            return zf.read('weka/core/version.txt')
         except KeyError:
             return None
     finally:
@@ -89,10 +91,10 @@ class WekaClassifier(ClassifierI):
         self._model = model_filename
 
     def prob_classify_many(self, featuresets):
-        return self._classify_many(featuresets, ["-p", "0", "-distribution"])
+        return self._classify_many(featuresets, ['-p', '0', '-distribution'])
 
     def classify_many(self, featuresets):
-        return self._classify_many(featuresets, ["-p", "0"])
+        return self._classify_many(featuresets, ['-p', '0'])
 
     def _classify_many(self, featuresets, options):
         # Make sure we can find java & weka.
@@ -101,15 +103,15 @@ class WekaClassifier(ClassifierI):
         temp_dir = tempfile.mkdtemp()
         try:
             # Write the test data file.
-            test_filename = os.path.join(temp_dir, "test.arff")
+            test_filename = os.path.join(temp_dir, 'test.arff')
             self._formatter.write(test_filename, featuresets)
 
             # Call weka to classify the data.
             cmd = [
-                "weka.classifiers.bayes.NaiveBayes",
-                "-l",
+                'weka.classifiers.bayes.NaiveBayes',
+                '-l',
                 self._model,
-                "-T",
+                '-T',
                 test_filename,
             ] + options
             (stdout, stderr) = java(
@@ -121,17 +123,17 @@ class WekaClassifier(ClassifierI):
 
             # Check if something went wrong:
             if stderr and not stdout:
-                if "Illegal options: -distribution" in stderr:
+                if 'Illegal options: -distribution' in stderr:
                     raise ValueError(
-                        "The installed version of weka does "
-                        "not support probability distribution "
-                        "output."
+                        'The installed version of weka does '
+                        'not support probability distribution '
+                        'output.'
                     )
                 else:
-                    raise ValueError("Weka failed to generate output:\n%s" % stderr)
+                    raise ValueError('Weka failed to generate output:\n%s' % stderr)
 
             # Parse weka's output.
-            return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n"))
+            return self.parse_weka_output(stdout.decode(stdin.encoding).split('\n'))
 
         finally:
             for f in os.listdir(temp_dir):
@@ -139,7 +141,7 @@ class WekaClassifier(ClassifierI):
             os.rmdir(temp_dir)
 
     def parse_weka_distribution(self, s):
-        probs = [float(v) for v in re.split("[*,]+", s) if v.strip()]
+        probs = [float(v) for v in re.split('[*,]+', s) if v.strip()]
         probs = dict(zip(self._formatter.labels(), probs))
         return DictionaryProbDist(probs)
 
@@ -150,14 +152,14 @@ class WekaClassifier(ClassifierI):
                 lines = lines[i:]
                 break
 
-        if lines[0].split() == ["inst#", "actual", "predicted", "error", "prediction"]:
-            return [line.split()[2].split(":")[1] for line in lines[1:] if line.strip()]
+        if lines[0].split() == ['inst#', 'actual', 'predicted', 'error', 'prediction']:
+            return [line.split()[2].split(':')[1] for line in lines[1:] if line.strip()]
         elif lines[0].split() == [
-            "inst#",
-            "actual",
-            "predicted",
-            "error",
-            "distribution",
+            'inst#',
+            'actual',
+            'predicted',
+            'error',
+            'distribution',
         ]:
             return [
                 self.parse_weka_distribution(line.split()[-1])
@@ -166,16 +168,16 @@ class WekaClassifier(ClassifierI):
             ]
 
         # is this safe:?
-        elif re.match(r"^0 \w+ [01]\.[0-9]* \?\s*$", lines[0]):
+        elif re.match(r'^0 \w+ [01]\.[0-9]* \?\s*$', lines[0]):
             return [line.split()[1] for line in lines if line.strip()]
 
         else:
             for line in lines[:10]:
                 print(line)
             raise ValueError(
-                "Unhandled output format -- your version "
-                "of weka may not be supported.\n"
-                "  Header: %s" % lines[0]
+                'Unhandled output format -- your version '
+                'of weka may not be supported.\n'
+                '  Header: %s' % lines[0]
             )
 
     # [xx] full list of classifiers (some may be abstract?):
@@ -192,12 +194,12 @@ class WekaClassifier(ClassifierI):
     # VotedPerceptron, Winnow, ZeroR
 
     _CLASSIFIER_CLASS = {
-        "naivebayes": "weka.classifiers.bayes.NaiveBayes",
-        "C4.5": "weka.classifiers.trees.J48",
-        "log_regression": "weka.classifiers.functions.Logistic",
-        "svm": "weka.classifiers.functions.SMO",
-        "kstar": "weka.classifiers.lazy.KStar",
-        "ripper": "weka.classifiers.rules.JRip",
+        'naivebayes': 'weka.classifiers.bayes.NaiveBayes',
+        'C4.5': 'weka.classifiers.trees.J48',
+        'log_regression': 'weka.classifiers.functions.Logistic',
+        'svm': 'weka.classifiers.functions.SMO',
+        'kstar': 'weka.classifiers.lazy.KStar',
+        'ripper': 'weka.classifiers.rules.JRip',
     }
 
     @classmethod
@@ -205,7 +207,7 @@ class WekaClassifier(ClassifierI):
         cls,
         model_filename,
         featuresets,
-        classifier="naivebayes",
+        classifier='naivebayes',
         options=[],
         quiet=True,
     ):
@@ -218,7 +220,7 @@ class WekaClassifier(ClassifierI):
         temp_dir = tempfile.mkdtemp()
         try:
             # Write the training data file.
-            train_filename = os.path.join(temp_dir, "train.arff")
+            train_filename = os.path.join(temp_dir, 'train.arff')
             formatter.write(train_filename, featuresets)
 
             if classifier in cls._CLASSIFIER_CLASS:
@@ -226,10 +228,10 @@ class WekaClassifier(ClassifierI):
             elif classifier in cls._CLASSIFIER_CLASS.values():
                 javaclass = classifier
             else:
-                raise ValueError("Unknown classifier %s" % classifier)
+                raise ValueError('Unknown classifier %s' % classifier)
 
             # Train the weka model.
-            cmd = [javaclass, "-d", model_filename, "-t", train_filename]
+            cmd = [javaclass, '-d', model_filename, '-t', train_filename]
             cmd += list(options)
             if quiet:
                 stdout = subprocess.PIPE
@@ -276,8 +278,8 @@ class ARFF_Formatter:
 
     def write(self, outfile, tokens):
         """Writes ARFF data to a file for the given data."""
-        if not hasattr(outfile, "write"):
-            outfile = open(outfile, "w")
+        if not hasattr(outfile, 'write'):
+            outfile = open(outfile, 'w')
         outfile.write(self.format(tokens))
         outfile.close()
 
@@ -296,18 +298,18 @@ class ARFF_Formatter:
         for tok, label in tokens:
             for (fname, fval) in tok.items():
                 if issubclass(type(fval), bool):
-                    ftype = "{True, False}"
-                elif issubclass(type(fval), (int, float, bool)):
-                    ftype = "NUMERIC"
-                elif issubclass(type(fval), str):
-                    ftype = "STRING"
+                    ftype = '{True, False}'
+                elif issubclass(type(fval), (integer_types, float, bool)):
+                    ftype = 'NUMERIC'
+                elif issubclass(type(fval), string_types):
+                    ftype = 'STRING'
                 elif fval is None:
                     continue  # can't tell the type.
                 else:
-                    raise ValueError("Unsupported value type %r" % ftype)
+                    raise ValueError('Unsupported value type %r' % ftype)
 
                 if features.get(fname, ftype) != ftype:
-                    raise ValueError("Inconsistent type for %s" % fname)
+                    raise ValueError('Inconsistent type for %s' % fname)
                 features[fname] = ftype
         features = sorted(features.items())
 
@@ -317,20 +319,20 @@ class ARFF_Formatter:
         """Returns an ARFF header as a string."""
         # Header comment.
         s = (
-            "% Weka ARFF file\n"
-            + "% Generated automatically by NLTK\n"
-            + "%% %s\n\n" % time.ctime()
+            '% Weka ARFF file\n'
+            + '% Generated automatically by NLTK\n'
+            + '%% %s\n\n' % time.ctime()
         )
 
         # Relation name
-        s += "@RELATION rel\n\n"
+        s += '@RELATION rel\n\n'
 
         # Input attribute specifications
         for fname, ftype in self._features:
-            s += "@ATTRIBUTE %-30r %s\n" % (fname, ftype)
+            s += '@ATTRIBUTE %-30r %s\n' % (fname, ftype)
 
         # Label attribute specification
-        s += "@ATTRIBUTE %-30r {%s}\n" % ("-label-", ",".join(self._labels))
+        s += '@ATTRIBUTE %-30r {%s}\n' % ('-label-', ','.join(self._labels))
 
         return s
 
@@ -352,29 +354,29 @@ class ARFF_Formatter:
             tokens = [(tok, None) for tok in tokens]
 
         # Data section
-        s = "\n@DATA\n"
+        s = '\n@DATA\n'
         for (tok, label) in tokens:
             for fname, ftype in self._features:
-                s += "%s," % self._fmt_arff_val(tok.get(fname))
-            s += "%s\n" % self._fmt_arff_val(label)
+                s += '%s,' % self._fmt_arff_val(tok.get(fname))
+            s += '%s\n' % self._fmt_arff_val(label)
 
         return s
 
     def _fmt_arff_val(self, fval):
         if fval is None:
-            return "?"
-        elif isinstance(fval, (bool, int)):
-            return "%s" % fval
+            return '?'
+        elif isinstance(fval, (bool, integer_types)):
+            return '%s' % fval
         elif isinstance(fval, float):
-            return "%r" % fval
+            return '%r' % fval
         else:
-            return "%r" % fval
+            return '%r' % fval
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     from nltk.classify.util import names_demo, binary_names_demo_features
 
     def make_classifier(featuresets):
-        return WekaClassifier.train("/tmp/name.model", featuresets, "C4.5")
+        return WekaClassifier.train('/tmp/name.model', featuresets, 'C4.5')
 
     classifier = names_demo(make_classifier, binary_names_demo_features)
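
The weka.py hunks above swap the built-in int/str checks for six's integer_types and string_types, so the ARFF type detection also recognises long and unicode values on Python 2. A minimal sketch of that mapping, assuming only that six is installed (the helper name arff_type is hypothetical, not part of the module):

    from six import integer_types, string_types

    # Python 2: integer_types == (int, long), string_types == (basestring,)
    # Python 3: integer_types == (int,),      string_types == (str,)
    def arff_type(value):
        # bool is checked first because bool is a subclass of int
        if isinstance(value, bool):
            return '{True, False}'
        if isinstance(value, integer_types + (float,)):
            return 'NUMERIC'
        if isinstance(value, string_types):
            return 'STRING'
        raise ValueError('Unsupported value type %r' % (value,))

    print(arff_type(3.5))   # NUMERIC
    print(arff_type('x'))   # STRING
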
diff --git a/nlp_resource_data/nltk/cli.py b/nlp_resource_data/nltk/cli.py
deleted file mode 100644 (file)
index 01ff3d0..0000000
+++ /dev/null
@@ -1,59 +0,0 @@
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: NLTK Command-Line Interface
-#
-# Copyright (C) 2001-2020 NLTK Project
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-
-from functools import partial
-from itertools import chain
-from tqdm import tqdm
-
-import click
-
-from nltk import word_tokenize
-from nltk.util import parallelize_preprocess
-
-CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
-
-
-@click.group(context_settings=CONTEXT_SETTINGS)
-@click.version_option()
-def cli():
-    pass
-
-
-@cli.command("tokenize")
-@click.option(
-    "--language",
-    "-l",
-    default="en",
-    help="The language for the Punkt sentence tokenization.",
-)
-@click.option(
-    "--preserve-line",
-    "-l",
-    default=True,
-    is_flag=True,
-    help="An option to keep the preserve the sentence and not sentence tokenize it.",
-)
-@click.option("--processes", "-j", default=1, help="No. of processes.")
-@click.option("--encoding", "-e", default="utf8", help="Specify encoding of file.")
-@click.option(
-    "--delimiter", "-d", default=" ", help="Specify delimiter to join the tokens."
-)
-def tokenize_file(language, preserve_line, processes, encoding, delimiter):
-    """ This command tokenizes text stream using nltk.word_tokenize """
-    with click.get_text_stream("stdin", encoding=encoding) as fin:
-        with click.get_text_stream("stdout", encoding=encoding) as fout:
-            # If it's single process, joblib parallization is slower,
-            # so just process line by line normally.
-            if processes == 1:
-                for line in tqdm(fin.readlines()):
-                    print(delimiter.join(word_tokenize(line)), end="\n", file=fout)
-            else:
-                for outline in parallelize_preprocess(
-                    word_tokenize, fin.readlines(), processes, progress_bar=True
-                ):
-                    print(delimiter.join(outline), end="\n", file=fout)
index 2310947..c7fc100 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Clusterers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
index a5235cb..ea33875 100644 (file)
Binary files a/nlp_resource_data/nltk/cluster/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/cluster/__pycache__/__init__.cpython-37.pyc differ
index 0cbec0e..36c6670 100644 (file)
Binary files a/nlp_resource_data/nltk/cluster/__pycache__/api.cpython-37.pyc and b/nlp_resource_data/nltk/cluster/__pycache__/api.cpython-37.pyc differ
index 40edf72..ac4c457 100644 (file)
Binary files a/nlp_resource_data/nltk/cluster/__pycache__/em.cpython-37.pyc and b/nlp_resource_data/nltk/cluster/__pycache__/em.cpython-37.pyc differ
index 14f29d8..298944d 100644 (file)
Binary files a/nlp_resource_data/nltk/cluster/__pycache__/gaac.cpython-37.pyc and b/nlp_resource_data/nltk/cluster/__pycache__/gaac.cpython-37.pyc differ
index c9b1d4e..cbdf9f0 100644 (file)
Binary files a/nlp_resource_data/nltk/cluster/__pycache__/kmeans.cpython-37.pyc and b/nlp_resource_data/nltk/cluster/__pycache__/kmeans.cpython-37.pyc differ
index 5403a35..e3bc756 100644 (file)
Binary files a/nlp_resource_data/nltk/cluster/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/cluster/__pycache__/util.cpython-37.pyc differ
index eb43b57..3f22f7f 100644 (file)
@@ -1,17 +1,19 @@
 # Natural Language Toolkit: Clusterer Interfaces
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
 # Porting: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 from abc import ABCMeta, abstractmethod
+from six import add_metaclass
 
 from nltk.probability import DictionaryProbDist
 
 
-class ClusterI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class ClusterI(object):
     """
     Interface covering basic clustering functionality.
     """
index a93d19c..51dcf1f 100644 (file)
@@ -1,18 +1,21 @@
 # Natural Language Toolkit: Expectation Maximization Clusterer
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
 
 try:
     import numpy
 except ImportError:
     pass
 
+from nltk.compat import python_2_unicode_compatible
 from nltk.cluster.util import VectorSpaceClusterer
 
 
+@python_2_unicode_compatible
 class EMClusterer(VectorSpaceClusterer):
     """
     The Gaussian EM clusterer models the vectors as being produced by
@@ -95,7 +98,7 @@ class EMClusterer(VectorSpaceClusterer):
 
         while not converged:
             if trace:
-                print("iteration; loglikelihood", lastl)
+                print('iteration; loglikelihood', lastl)
             # E-step, calculate hidden variables, h[i,j]
             h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64)
             for i in range(len(vectors)):
@@ -149,7 +152,7 @@ class EMClusterer(VectorSpaceClusterer):
 
     def _gaussian(self, mean, cvm, x):
         m = len(mean)
-        assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape)
+        assert cvm.shape == (m, m), 'bad sized covariance matrix, %s' % str(cvm.shape)
         try:
             det = numpy.linalg.det(cvm)
             inv = numpy.linalg.inv(cvm)
@@ -173,7 +176,7 @@ class EMClusterer(VectorSpaceClusterer):
         return llh
 
     def __repr__(self):
-        return "<EMClusterer means=%s>" % list(self._means)
+        return '<EMClusterer means=%s>' % list(self._means)
 
 
 def demo():
@@ -191,28 +194,64 @@ def demo():
     clusterer = cluster.EMClusterer(means, bias=0.1)
     clusters = clusterer.cluster(vectors, True, trace=True)
 
-    print("Clustered:", vectors)
-    print("As:       ", clusters)
+    print('Clustered:', vectors)
+    print('As:       ', clusters)
     print()
 
     for c in range(2):
-        print("Cluster:", c)
-        print("Prior:  ", clusterer._priors[c])
-        print("Mean:   ", clusterer._means[c])
-        print("Covar:  ", clusterer._covariance_matrices[c])
+        print('Cluster:', c)
+        print('Prior:  ', clusterer._priors[c])
+        print('Mean:   ', clusterer._means[c])
+        print('Covar:  ', clusterer._covariance_matrices[c])
         print()
 
     # classify a new vector
     vector = numpy.array([2, 2])
-    print("classify(%s):" % vector, end=" ")
+    print('classify(%s):' % vector, end=' ')
     print(clusterer.classify(vector))
 
     # show the classification probabilities
     vector = numpy.array([2, 2])
-    print("classification_probdist(%s):" % vector)
+    print('classification_probdist(%s):' % vector)
     pdist = clusterer.classification_probdist(vector)
     for sample in pdist.samples():
-        print("%s => %.0f%%" % (sample, pdist.prob(sample) * 100))
+        print('%s => %.0f%%' % (sample, pdist.prob(sample) * 100))
 
-if __name__ == "__main__":
+
+#
+#     The following demo code is broken.
+#
+#     # use a set of tokens with 2D indices
+#     vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
+
+#     # test the EM clusterer with means given by k-means (2) and
+#     # dimensionality reduction
+#     clusterer = cluster.KMeans(2, euclidean_distance, svd_dimensions=1)
+#     print 'Clusterer:', clusterer
+#     clusters = clusterer.cluster(vectors)
+#     means = clusterer.means()
+#     print 'Means:', clusterer.means()
+#     print
+
+#     clusterer = cluster.EMClusterer(means, svd_dimensions=1)
+#     clusters = clusterer.cluster(vectors, True)
+#     print 'Clusterer:', clusterer
+#     print 'Clustered:', str(vectors)[:60], '...'
+#     print 'As:', str(clusters)[:60], '...'
+#     print
+
+#     # classify a new vector
+#     vector = numpy.array([3, 3])
+#     print 'classify(%s):' % vector,
+#     print clusterer.classify(vector)
+#     print
+
+#     # show the classification probabilities
+#     vector = numpy.array([2.2, 2])
+#     print 'classification_probdist(%s)' % vector
+#     pdist = clusterer.classification_probdist(vector)
+#     for sample in pdist:
+#         print '%s => %.0f%%' % (sample, pdist.prob(sample) *100)
+
+if __name__ == '__main__':
     demo()
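
The python_2_unicode_compatible decorator applied to EMClusterer above (its definition appears in the nltk/compat.py diff further down) lets a class keep a single text-returning __repr__: on Python 3 it is effectively a no-op, while on Python 2 it installs ASCII-safe wrappers. A minimal sketch with a hypothetical class:

    from __future__ import unicode_literals
    from nltk.compat import python_2_unicode_compatible

    @python_2_unicode_compatible
    class Point(object):
        def __repr__(self):
            # written once, as text; Python 2 byte encoding is handled by the decorator
            return '<Point x=1 y=2>'

    print(repr(Point()))  # <Point x=1 y=2> on both interpreters
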
index 436ef98..06eb30e 100644 (file)
@@ -1,9 +1,10 @@
 # Natural Language Toolkit: Group Average Agglomerative Clusterer
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
 
 try:
     import numpy
@@ -11,8 +12,10 @@ except ImportError:
     pass
 
 from nltk.cluster.util import VectorSpaceClusterer, Dendrogram, cosine_distance
+from nltk.compat import python_2_unicode_compatible
 
 
+@python_2_unicode_compatible
 class GAAClusterer(VectorSpaceClusterer):
     """
     The Group Average Agglomerative starts with each of the N vectors as singleton
@@ -134,7 +137,7 @@ class GAAClusterer(VectorSpaceClusterer):
         return self._num_clusters
 
     def __repr__(self):
-        return "<GroupAverageAgglomerative Clusterer n=%d>" % self._num_clusters
+        return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
 
 
 def demo():
@@ -151,9 +154,9 @@ def demo():
     clusterer = GAAClusterer(4)
     clusters = clusterer.cluster(vectors, True)
 
-    print("Clusterer:", clusterer)
-    print("Clustered:", vectors)
-    print("As:", clusters)
+    print('Clusterer:', clusterer)
+    print('Clustered:', vectors)
+    print('As:', clusters)
     print()
 
     # show the dendrogram
@@ -161,10 +164,10 @@ def demo():
 
     # classify a new vector
     vector = numpy.array([3, 3])
-    print("classify(%s):" % vector, end=" ")
+    print('classify(%s):' % vector, end=' ')
     print(clusterer.classify(vector))
     print()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
index 389ff68..bfe1604 100644 (file)
@@ -1,9 +1,10 @@
 # Natural Language Toolkit: K-Means Clusterer
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
 
 import copy
 import random
@@ -16,8 +17,10 @@ except ImportError:
 
 
 from nltk.cluster.util import VectorSpaceClusterer
+from nltk.compat import python_2_unicode_compatible
 
 
+@python_2_unicode_compatible
 class KMeansClusterer(VectorSpaceClusterer):
     """
     The K-means clusterer starts with k arbitrary chosen means then allocates
@@ -80,12 +83,12 @@ class KMeansClusterer(VectorSpaceClusterer):
 
     def cluster_vectorspace(self, vectors, trace=False):
         if self._means and self._repeats > 1:
-            print("Warning: means will be discarded for subsequent trials")
+            print('Warning: means will be discarded for subsequent trials')
 
         meanss = []
         for trial in range(self._repeats):
             if trace:
-                print("k-means trial", trial)
+                print('k-means trial', trial)
             if not self._means or trial > 1:
                 self._means = self._rng.sample(list(vectors), self._num_means)
             self._cluster_vectorspace(vectors, trace)
@@ -123,7 +126,7 @@ class KMeansClusterer(VectorSpaceClusterer):
                     clusters[index].append(vector)
 
                 if trace:
-                    print("iteration")
+                    print('iteration')
                 # for i in range(self._num_means):
                 # print '  mean', i, 'allocated', len(clusters[i]), 'vectors'
 
@@ -175,9 +178,9 @@ class KMeansClusterer(VectorSpaceClusterer):
             return centroid / (1 + len(cluster))
         else:
             if not len(cluster):
-                sys.stderr.write("Error: no centroid defined for empty cluster.\n")
+                sys.stderr.write('Error: no centroid defined for empty cluster.\n')
                 sys.stderr.write(
-                    "Try setting argument 'avoid_empty_clusters' to True\n"
+                    'Try setting argument \'avoid_empty_clusters\' to True\n'
                 )
                 assert False
             centroid = copy.copy(cluster[0])
@@ -186,7 +189,7 @@ class KMeansClusterer(VectorSpaceClusterer):
             return centroid / len(cluster)
 
     def __repr__(self):
-        return "<KMeansClusterer means=%s repeats=%d>" % (self._means, self._repeats)
+        return '<KMeansClusterer means=%s repeats=%d>' % (self._means, self._repeats)
 
 
 #################################################################################
@@ -203,9 +206,9 @@ def demo():
     clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
     clusters = clusterer.cluster(vectors, True, trace=True)
 
-    print("Clustered:", vectors)
-    print("As:", clusters)
-    print("Means:", clusterer.means())
+    print('Clustered:', vectors)
+    print('As:', clusters)
+    print('Means:', clusterer.means())
     print()
 
     vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
@@ -215,17 +218,17 @@ def demo():
 
     clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
     clusters = clusterer.cluster(vectors, True)
-    print("Clustered:", vectors)
-    print("As:", clusters)
-    print("Means:", clusterer.means())
+    print('Clustered:', vectors)
+    print('As:', clusters)
+    print('Means:', clusterer.means())
     print()
 
     # classify a new vector
     vector = numpy.array([3, 3])
-    print("classify(%s):" % vector, end=" ")
+    print('classify(%s):' % vector, end=' ')
     print(clusterer.classify(vector))
     print()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
index c7ab691..a3576e7 100644 (file)
@@ -1,10 +1,11 @@
 # Natural Language Toolkit: Clusterer Utilities
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
 # Contributor: J Richard Snape
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals, division
 from abc import abstractmethod
 
 import copy
@@ -17,6 +18,7 @@ except ImportError:
     pass
 
 from nltk.cluster.api import ClusterI
+from nltk.compat import python_2_unicode_compatible
 
 
 class VectorSpaceClusterer(ClusterI):
@@ -174,6 +176,7 @@ class _DendrogramNode(object):
         return cosine_distance(self._value, comparator._value) < 0
 
 
+@python_2_unicode_compatible
 class Dendrogram(object):
     """
     Represents a dendrogram, a tree with a specified branching order.  This
@@ -228,7 +231,7 @@ class Dendrogram(object):
         """
 
         # ASCII rendering characters
-        JOIN, HLINK, VLINK = "+", "-", "|"
+        JOIN, HLINK, VLINK = '+', '-', '|'
 
         # find the root (or create one)
         if len(self._items) > 1:
@@ -248,15 +251,15 @@ class Dendrogram(object):
         rhalf = int(width - lhalf - 1)
 
         # display functions
-        def format(centre, left=" ", right=" "):
-            return "%s%s%s" % (lhalf * left, centre, right * rhalf)
+        def format(centre, left=' ', right=' '):
+            return '%s%s%s' % (lhalf * left, centre, right * rhalf)
 
         def display(str):
             stdout.write(str)
 
         # for each merge, top down
         queue = [(root._value, root)]
-        verticals = [format(" ") for leaf in leaves]
+        verticals = [format(' ') for leaf in leaves]
         while queue:
             priority, node = queue.pop()
             child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children))
@@ -267,9 +270,9 @@ class Dendrogram(object):
             for i in range(len(leaves)):
                 if leaves[i] in child_left_leaf:
                     if i == min_idx:
-                        display(format(JOIN, " ", HLINK))
+                        display(format(JOIN, ' ', HLINK))
                     elif i == max_idx:
-                        display(format(JOIN, HLINK, " "))
+                        display(format(JOIN, HLINK, ' '))
                     else:
                         display(format(JOIN, HLINK, HLINK))
                     verticals[i] = format(VLINK)
@@ -277,7 +280,7 @@ class Dendrogram(object):
                     display(format(HLINK, HLINK, HLINK))
                 else:
                     display(verticals[i])
-            display("\n")
+            display('\n')
             for child in node._children:
                 if child._children:
                     queue.append((child._value, child))
@@ -285,11 +288,11 @@ class Dendrogram(object):
 
             for vertical in verticals:
                 display(vertical)
-            display("\n")
+            display('\n')
 
         # finally, display the last line
-        display("".join(item.center(width) for item in last_row))
-        display("\n")
+        display(''.join(item.center(width) for item in last_row))
+        display('\n')
 
     def __repr__(self):
         if len(self._items) > 1:
@@ -297,4 +300,4 @@ class Dendrogram(object):
         else:
             root = self._items[0]
         leaves = root.leaves(False)
-        return "<Dendrogram with %d leaves>" % len(leaves)
+        return '<Dendrogram with %d leaves>' % len(leaves)
index efbb78c..882e15c 100644 (file)
@@ -1,18 +1,21 @@
 # Natural Language Toolkit: Collections
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, absolute_import
 
 import bisect
 from itertools import islice, chain
 from functools import total_ordering
-
 # this unused import is for python 2.7
 from collections import defaultdict, deque, Counter
 
+from six import text_type
+
 from nltk.internals import slice_bounds, raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible
 
 
 ##########################################################################
@@ -22,8 +25,8 @@ from nltk.internals import slice_bounds, raise_unorderable_types
 
 class OrderedDict(dict):
     def __init__(self, data=None, **kwargs):
-        self._keys = self.keys(data, kwargs.get("keys"))
-        self._default_factory = kwargs.get("default_factory")
+        self._keys = self.keys(data, kwargs.get('keys'))
+        self._default_factory = kwargs.get('default_factory')
         if data is None:
             dict.__init__(self)
         else:
@@ -81,7 +84,7 @@ class OrderedDict(dict):
                     return data.keys()
                 elif isinstance(data, list):
                     return [key for (key, value) in data]
-        elif "_keys" in self.__dict__:
+        elif '_keys' in self.__dict__:
             return self._keys
         else:
             return []
@@ -117,6 +120,7 @@ class OrderedDict(dict):
 
 
 @total_ordering
+@python_2_unicode_compatible
 class AbstractLazySequence(object):
     """
     An abstract base class for read-only sequences whose values are
@@ -143,7 +147,7 @@ class AbstractLazySequence(object):
         Return the number of tokens in the corpus file underlying this
         corpus view.
         """
-        raise NotImplementedError("should be implemented by subclass")
+        raise NotImplementedError('should be implemented by subclass')
 
     def iterate_from(self, start):
         """
@@ -152,7 +156,7 @@ class AbstractLazySequence(object):
         ``start``.  If ``start>=len(self)``, then this iterator will
         generate no tokens.
         """
-        raise NotImplementedError("should be implemented by subclass")
+        raise NotImplementedError('should be implemented by subclass')
 
     def __getitem__(self, i):
         """
@@ -167,12 +171,12 @@ class AbstractLazySequence(object):
             if i < 0:
                 i += len(self)
             if i < 0:
-                raise IndexError("index out of range")
+                raise IndexError('index out of range')
             # Use iterate_from to extract it.
             try:
                 return next(self.iterate_from(i))
             except StopIteration:
-                raise IndexError("index out of range")
+                raise IndexError('index out of range')
 
     def __iter__(self):
         """Return an iterator that generates the tokens in the corpus
@@ -192,7 +196,7 @@ class AbstractLazySequence(object):
         for i, elt in enumerate(islice(self, start, stop)):
             if elt == value:
                 return i + start
-        raise ValueError("index(x): x not in list")
+        raise ValueError('index(x): x not in list')
 
     def __contains__(self, value):
         """Return true if this list contains ``value``."""
@@ -228,8 +232,8 @@ class AbstractLazySequence(object):
             pieces.append(repr(elt))
             length += len(pieces[-1]) + 2
             if length > self._MAX_REPR_SIZE and len(pieces) > 2:
-                return "[%s, ...]" % ", ".join(pieces[:-1])
-        return "[%s]" % ", ".join(pieces)
+                return '[%s, ...]' % text_type(', ').join(pieces[:-1])
+        return '[%s]' % text_type(', ').join(pieces)
 
     def __eq__(self, other):
         return type(self) == type(other) and list(self) == list(other)
@@ -246,7 +250,7 @@ class AbstractLazySequence(object):
         """
         :raise ValueError: Corpus view objects are unhashable.
         """
-        raise ValueError("%s objects are unhashable" % self.__class__.__name__)
+        raise ValueError('%s objects are unhashable' % self.__class__.__name__)
 
 
 class LazySubsequence(AbstractLazySequence):
@@ -327,12 +331,12 @@ class LazyConcatenation(AbstractLazySequence):
             if sublist_index == (len(self._offsets) - 1):
                 assert (
                     index + len(sublist) >= self._offsets[-1]
-                ), "offests not monotonic increasing!"
+                ), 'offests not monotonic increasing!'
                 self._offsets.append(index + len(sublist))
             else:
                 assert self._offsets[sublist_index + 1] == index + len(
                     sublist
-                ), "inconsistent list value (num elts)"
+                ), 'inconsistent list value (num elts)'
 
             for value in sublist[max(0, start_index - index) :]:
                 yield value
@@ -386,11 +390,11 @@ class LazyMap(AbstractLazySequence):
             by this lazy map.  (default=5)
         """
         if not lists:
-            raise TypeError("LazyMap requires at least two args")
+            raise TypeError('LazyMap requires at least two args')
 
         self._lists = lists
         self._func = function
-        self._cache_size = config.get("cache_size", 5)
+        self._cache_size = config.get('cache_size', 5)
         self._cache = {} if self._cache_size > 0 else None
 
         # If you just take bool() of sum() here _all_lazy will be true just
@@ -457,7 +461,7 @@ class LazyMap(AbstractLazySequence):
             if index < 0:
                 index += len(self)
             if index < 0:
-                raise IndexError("index out of range")
+                raise IndexError('index out of range')
             # Check the cache
             if self._cache is not None and index in self._cache:
                 return self._cache[index]
@@ -465,7 +469,7 @@ class LazyMap(AbstractLazySequence):
             try:
                 val = next(self.iterate_from(index))
             except StopIteration:
-                raise IndexError("index out of range")
+                raise IndexError('index out of range')
             # Update the cache
             if self._cache is not None:
                 if len(self._cache) > self._cache_size:
index 150e29b..eb7bdda 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Collocations and Association Measures
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Joel Nothman <jnothman@student.usyd.edu.au>
 # URL: <http://nltk.org>
 # For license information, see LICENSE.TXT
@@ -23,6 +23,7 @@ these functionalities, dependent on being provided a function which scores a
 ngram given appropriate frequency counts. A number of standard association
 measures are provided in bigram_measures and trigram_measures.
 """
+from __future__ import print_function
 
 # Possible TODOs:
 # - consider the distinction between f(x,_) and f(x) and whether our
@@ -31,17 +32,12 @@ measures are provided in bigram_measures and trigram_measures.
 #   and unigram counts (raw_freq, pmi, student_t)
 
 import itertools as _itertools
+from six import iteritems
 
 from nltk.probability import FreqDist
 from nltk.util import ngrams
-
 # these two unused imports are referenced in collocations.doctest
-from nltk.metrics import (
-    ContingencyMeasures,
-    BigramAssocMeasures,
-    TrigramAssocMeasures,
-    QuadgramAssocMeasures,
-)
+from nltk.metrics import ContingencyMeasures, BigramAssocMeasures, TrigramAssocMeasures
 from nltk.metrics.spearman import ranks_from_scores, spearman_correlation
 
 
@@ -65,9 +61,9 @@ class AbstractCollocationFinder(object):
     def _build_new_documents(
         cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None
     ):
-        """
+        '''
         Pad the document with the place holder according to the window_size
-        """
+        '''
         padding = (pad_symbol,) * (window_size - 1)
         if pad_right:
             return _itertools.chain.from_iterable(
@@ -97,7 +93,7 @@ class AbstractCollocationFinder(object):
         if the function returns True when passed an ngram tuple.
         """
         tmp_ngram = FreqDist()
-        for ngram, freq in self.ngram_fd.items():
+        for ngram, freq in iteritems(self.ngram_fd):
             if not fn(ngram, freq):
                 tmp_ngram[ngram] = freq
         self.ngram_fd = tmp_ngram
@@ -367,7 +363,7 @@ def demo(scorer=None, compare_scorer=None):
 
     from nltk.corpus import stopwords, webtext
 
-    ignored_words = stopwords.words("english")
+    ignored_words = stopwords.words('english')
     word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words
 
     for file in webtext.fileids():
@@ -382,31 +378,31 @@ def demo(scorer=None, compare_scorer=None):
             ranks_from_scores(cf.score_ngrams(compare_scorer)),
         )
         print(file)
-        print("\t", [" ".join(tup) for tup in cf.nbest(scorer, 15)])
-        print("\t Correlation to %s: %0.4f" % (compare_scorer.__name__, corr))
+        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
+        print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__, corr))
 
 
 # Slows down loading too much
 # bigram_measures = BigramAssocMeasures()
 # trigram_measures = TrigramAssocMeasures()
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     import sys
     from nltk.metrics import BigramAssocMeasures
 
     try:
-        scorer = eval("BigramAssocMeasures." + sys.argv[1])
+        scorer = eval('BigramAssocMeasures.' + sys.argv[1])
     except IndexError:
         scorer = None
     try:
-        compare_scorer = eval("BigramAssocMeasures." + sys.argv[2])
+        compare_scorer = eval('BigramAssocMeasures.' + sys.argv[2])
     except IndexError:
         compare_scorer = None
 
     demo(scorer, compare_scorer)
 
 __all__ = [
-    "BigramCollocationFinder",
-    "TrigramCollocationFinder",
-    "QuadgramCollocationFinder",
+    'BigramCollocationFinder',
+    'TrigramCollocationFinder',
+    'QuadgramCollocationFinder',
 ]
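
six.iteritems is used in the ngram-filtering hunk above so the frequency dictionary is walked lazily on Python 2 (dict.iteritems()) and through the normal items() view on Python 3. A short sketch with a plain dict standing in for the FreqDist:

    from six import iteritems

    counts = {('of', 'the'): 12, ('in', 'a'): 3}
    kept = dict((ngram, freq) for ngram, freq in iteritems(counts) if freq >= 5)
    print(kept)  # {('of', 'the'): 12}
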
index 163a200..fef28a6 100644 (file)
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Compatibility
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 #
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import absolute_import, print_function
 import os
-from functools import wraps
+import sys
+from functools import update_wrapper, wraps
+import fractions
+import unicodedata
+
+from six import string_types, text_type
+
+# Python 2/3 compatibility layer. Based on six.
+
+PY3 = sys.version_info[0] == 3
+
+if PY3:
+
+    def get_im_class(meth):
+        return meth.__self__.__class__
+
+    import io
+
+    StringIO = io.StringIO
+    BytesIO = io.BytesIO
+
+    from datetime import timezone
+
+    UTC = timezone.utc
+
+    from tempfile import TemporaryDirectory
+
+else:
+
+    def get_im_class(meth):
+        return meth.im_class
+
+    try:
+        from cStringIO import StringIO
+    except ImportError:
+        from StringIO import StringIO
+    BytesIO = StringIO
+
+    from datetime import tzinfo, timedelta
+
+    ZERO = timedelta(0)
+    HOUR = timedelta(hours=1)
+
+    # A UTC class for python 2.7
+    class UTC(tzinfo):
+        """UTC"""
+
+        def utcoffset(self, dt):
+            return ZERO
+
+        def tzname(self, dt):
+            return "UTC"
+
+        def dst(self, dt):
+            return ZERO
+
+    UTC = UTC()
+
+    import csv
+    import codecs
+    import cStringIO
+
+    class UnicodeWriter:
+        """
+        A CSV writer which will write rows to CSV file "f",
+        which is encoded in the given encoding.
+        see https://docs.python.org/2/library/csv.html
+        """
+
+        def __init__(
+            self, f, dialect=csv.excel, encoding="utf-8", errors='replace', **kwds
+        ):
+            # Redirect output to a queue
+            self.queue = cStringIO.StringIO()
+            self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
+            self.stream = f
+            encoder_cls = codecs.getincrementalencoder(encoding)
+            self.encoder = encoder_cls(errors=errors)
+
+        def encode(self, data):
+            if isinstance(data, string_types):
+                return data.encode("utf-8")
+            else:
+                return data
+
+        def writerow(self, row):
+            self.writer.writerow([self.encode(s) for s in row])
+            # Fetch UTF-8 output from the queue ...
+            data = self.queue.getvalue()
+            data = data.decode("utf-8")
+            # ... and reencode it into the target encoding
+            data = self.encoder.encode(data, 'replace')
+            # write to the target stream
+            self.stream.write(data)
+            # empty queue
+            self.queue.truncate(0)
+
+    import warnings as _warnings
+    import os as _os
+    from tempfile import mkdtemp
+
+    class TemporaryDirectory(object):
+        """Create and return a temporary directory.  This has the same
+        behavior as mkdtemp but can be used as a context manager.  For
+        example:
+
+            with TemporaryDirectory() as tmpdir:
+                ...
+
+        Upon exiting the context, the directory and everything contained
+        in it are removed.
+
+        http://stackoverflow.com/questions/19296146/tempfile-temporarydirectory-context-manager-in-python-2-7
+        """
+
+        def __init__(self, suffix="", prefix="tmp", dir=None):
+            self._closed = False
+            self.name = None  # Handle mkdtemp raising an exception
+            self.name = mkdtemp(suffix, prefix, dir)
+
+        def __repr__(self):
+            return "<{} {!r}>".format(self.__class__.__name__, self.name)
+
+        def __enter__(self):
+            return self.name
+
+        def cleanup(self, _warn=False):
+            if self.name and not self._closed:
+                try:
+                    self._rmtree(self.name)
+                except (TypeError, AttributeError) as ex:
+                    # Issue #10188: Emit a warning on stderr
+                    # if the directory could not be cleaned
+                    # up due to missing globals
+                    if "None" not in str(ex):
+                        raise
+                    print(
+                        "ERROR: {!r} while cleaning up {!r}".format(ex, self),
+                        file=sys.stderr,
+                    )
+                    return
+                self._closed = True
+                if _warn:
+                    self._warn("Implicitly cleaning up {!r}".format(self), Warning)
+
+        def __exit__(self, exc, value, tb):
+            self.cleanup()
+
+        def __del__(self):
+            # Issue a Warning if implicit cleanup needed
+            self.cleanup(_warn=True)
+
+        # XXX (ncoghlan): The following code attempts to make
+        # this class tolerant of the module nulling out process
+        # that happens during CPython interpreter shutdown
+        # Alas, it doesn't actually manage it. See issue #10188
+        _listdir = staticmethod(_os.listdir)
+        _path_join = staticmethod(_os.path.join)
+        _isdir = staticmethod(_os.path.isdir)
+        _islink = staticmethod(_os.path.islink)
+        _remove = staticmethod(_os.remove)
+        _rmdir = staticmethod(_os.rmdir)
+        _warn = _warnings.warn
+
+        def _rmtree(self, path):
+            # Essentially a stripped down version of shutil.rmtree.  We can't
+            # use globals because they may be None'ed out at shutdown.
+            for name in self._listdir(path):
+                fullname = self._path_join(path, name)
+                try:
+                    isdir = self._isdir(fullname) and not self._islink(fullname)
+                except OSError:
+                    isdir = False
+                if isdir:
+                    self._rmtree(fullname)
+                else:
+                    try:
+                        self._remove(fullname)
+                    except OSError:
+                        pass
+            try:
+                self._rmdir(path)
+            except OSError:
+                pass
+
 
 # ======= Compatibility for datasets that care about Python versions ========
 
@@ -22,14 +207,16 @@ DATA_UPDATES = [
 
 _PY3_DATA_UPDATES = [os.path.join(*path_list) for path_list in DATA_UPDATES]
 
+
 def add_py3_data(path):
-    for item in _PY3_DATA_UPDATES:
-        if item in str(path) and "/PY3" not in str(path):
-            pos = path.index(item) + len(item)
-            if path[pos : pos + 4] == ".zip":
-                pos += 4
-            path = path[:pos] + "/PY3" + path[pos:]
-            break
+    if PY3:
+        for item in _PY3_DATA_UPDATES:
+            if item in str(path) and "/PY3" not in str(path):
+                pos = path.index(item) + len(item)
+                if path[pos : pos + 4] == ".zip":
+                    pos += 4
+                path = path[:pos] + "/PY3" + path[pos:]
+                break
     return path
 
 
@@ -41,3 +228,146 @@ def py3_data(init_func):
         return init_func(*args, **kwargs)
 
     return wraps(init_func)(_decorator)
+
+
+# ======= Compatibility layer for __str__ and __repr__ ==========
+def remove_accents(text):
+
+    if isinstance(text, bytes):
+        text = text.decode('ascii')
+
+    category = unicodedata.category  # this gives a small (~10%) speedup
+    return ''.join(
+        c for c in unicodedata.normalize('NFKD', text) if category(c) != 'Mn'
+    )
+
+
+# Select the best transliteration method:
+try:
+    # Older versions of Unidecode are licensed under Artistic License;
+    # assume an older version is installed.
+    from unidecode import unidecode as transliterate
+except ImportError:
+    try:
+        # text-unidecode implementation is worse than Unidecode
+        # implementation so Unidecode is preferred.
+        from text_unidecode import unidecode as transliterate
+    except ImportError:
+        # This transliteration method should be enough
+        # for many Western languages.
+        transliterate = remove_accents
+
+
+def python_2_unicode_compatible(klass):
+    """
+    This decorator defines __unicode__ method and fixes
+    __repr__ and __str__ methods under Python 2.
+
+    To support Python 2 and 3 with a single code base,
+    define __str__ and __repr__ methods returning unicode
+    text and apply this decorator to the class.
+
+    Original __repr__ and __str__ would be available
+    as unicode_repr and __unicode__ (under both Python 2
+    and Python 3).
+    """
+
+    if not issubclass(klass, object):
+        raise ValueError("This decorator doesn't work for old-style classes")
+
+    # both __unicode__ and unicode_repr are public because they
+    # may be useful in console under Python 2.x
+
+    # if __str__ or __repr__ are not overridden in a subclass,
+    # they may already be fixed by this decorator in a parent class
+    # and we shouldn't fix them again
+
+    if not _was_fixed(klass.__str__):
+        klass.__unicode__ = klass.__str__
+        if not PY3:
+            klass.__str__ = _7bit(_transliterated(klass.__unicode__))
+
+    if not _was_fixed(klass.__repr__):
+        klass.unicode_repr = klass.__repr__
+        if not PY3:
+            klass.__repr__ = _7bit(klass.unicode_repr)
+
+    return klass
+
+
+def unicode_repr(obj):
+    """
+    For classes that were fixed with @python_2_unicode_compatible
+    ``unicode_repr`` returns ``obj.unicode_repr()``; for unicode strings
+    the result is returned without "u" letter (to make output the
+    same under Python 2.x and Python 3.x); for other variables
+    it is the same as ``repr``.
+    """
+    if PY3:
+        return repr(obj)
+
+    # Python 2.x
+    if hasattr(obj, 'unicode_repr'):
+        return obj.unicode_repr()
+
+    if isinstance(obj, text_type):
+        return repr(obj)[1:]  # strip "u" letter from output
+
+    return repr(obj)
+
+
+def _transliterated(method):
+    def wrapper(self):
+        return transliterate(method(self))
+
+    update_wrapper(wrapper, method, ["__name__", "__doc__"])
+    if hasattr(method, "_nltk_compat_7bit"):
+        wrapper._nltk_compat_7bit = method._nltk_compat_7bit
+
+    wrapper._nltk_compat_transliterated = True
+    return wrapper
+
+
+def _7bit(method):
+    def wrapper(self):
+        return method(self).encode('ascii', 'backslashreplace')
+
+    update_wrapper(wrapper, method, ["__name__", "__doc__"])
+
+    if hasattr(method, "_nltk_compat_transliterated"):
+        wrapper._nltk_compat_transliterated = method._nltk_compat_transliterated
+
+    wrapper._nltk_compat_7bit = True
+    return wrapper
+
+
+def _was_fixed(method):
+    return getattr(method, "_nltk_compat_7bit", False) or getattr(
+        method, "_nltk_compat_transliterated", False
+    )
+
+
+class Fraction(fractions.Fraction):
+    """
+    This is a simplified backwards compatible version of fractions.Fraction
+    from Python >=3.5. It adds the `_normalize` parameter such that it does
+    not normalize the denominator to the Greatest Common Divisor (gcd) when
+    the numerator is 0.
+
+    This is most probably only used by nltk.translate.bleu_score, where
+    numerator and denominator of the different ngram precisions are mutable.
+    But the idea of "mutable" fraction might not be applicable to other usages,
+    See http://stackoverflow.com/questions/34561265
+
+    This object should be deprecated once NLTK stops supporting Python < 3.5
+    See https://github.com/nltk/nltk/issues/1330
+    """
+
+    def __new__(cls, numerator=0, denominator=None, _normalize=True):
+        cls = super(Fraction, cls).__new__(cls, numerator, denominator)
+        # To emulate fraction.Fraction.from_float across Python >=2.7,
+        # check that numerator is an integer and denominator is not None.
+        if not _normalize and type(numerator) == int and denominator:
+            cls._numerator = numerator
+            cls._denominator = denominator
+        return cls
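
The Fraction subclass added above exists so that a zero numerator is not normalised away together with its denominator, which, per its docstring, matters for the ngram precision counts in nltk.translate.bleu_score. A short illustration of the intended behaviour (assuming the backport above is importable as nltk.compat.Fraction):

    from fractions import Fraction as StdFraction
    from nltk.compat import Fraction

    print(StdFraction(0, 5))                   # 0 -- the standard class normalises to 0/1
    f = Fraction(0, 5, _normalize=False)
    print('%d/%d' % (f.numerator, f.denominator))  # 0/5 -- denominator preserved
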
index b305c95..89a15eb 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Corpus Readers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -66,359 +66,359 @@ from nltk.corpus.util import LazyCorpusLoader
 from nltk.corpus.reader import *
 
 abc = LazyCorpusLoader(
-    "abc",
+    'abc',
     PlaintextCorpusReader,
-    r"(?!\.).*\.txt",
-    encoding=[("science", "latin_1"), ("rural", "utf8")],
+    r'(?!\.).*\.txt',
+    encoding=[('science', 'latin_1'), ('rural', 'utf8')],
 )
-alpino = LazyCorpusLoader("alpino", AlpinoCorpusReader, tagset="alpino")
+alpino = LazyCorpusLoader('alpino', AlpinoCorpusReader, tagset='alpino')
 brown = LazyCorpusLoader(
-    "brown",
+    'brown',
     CategorizedTaggedCorpusReader,
-    r"c[a-z]\d\d",
-    cat_file="cats.txt",
-    tagset="brown",
+    r'c[a-z]\d\d',
+    cat_file='cats.txt',
+    tagset='brown',
     encoding="ascii",
 )
 cess_cat = LazyCorpusLoader(
-    "cess_cat",
+    'cess_cat',
     BracketParseCorpusReader,
-    r"(?!\.).*\.tbf",
-    tagset="unknown",
-    encoding="ISO-8859-15",
+    r'(?!\.).*\.tbf',
+    tagset='unknown',
+    encoding='ISO-8859-15',
 )
 cess_esp = LazyCorpusLoader(
-    "cess_esp",
+    'cess_esp',
     BracketParseCorpusReader,
-    r"(?!\.).*\.tbf",
-    tagset="unknown",
-    encoding="ISO-8859-15",
+    r'(?!\.).*\.tbf',
+    tagset='unknown',
+    encoding='ISO-8859-15',
 )
-cmudict = LazyCorpusLoader("cmudict", CMUDictCorpusReader, ["cmudict"])
-comtrans = LazyCorpusLoader("comtrans", AlignedCorpusReader, r"(?!\.).*\.txt")
+cmudict = LazyCorpusLoader('cmudict', CMUDictCorpusReader, ['cmudict'])
+comtrans = LazyCorpusLoader('comtrans', AlignedCorpusReader, r'(?!\.).*\.txt')
 comparative_sentences = LazyCorpusLoader(
-    "comparative_sentences",
+    'comparative_sentences',
     ComparativeSentencesCorpusReader,
-    r"labeledSentences\.txt",
-    encoding="latin-1",
+    r'labeledSentences\.txt',
+    encoding='latin-1',
 )
 conll2000 = LazyCorpusLoader(
-    "conll2000",
+    'conll2000',
     ConllChunkCorpusReader,
-    ["train.txt", "test.txt"],
-    ("NP", "VP", "PP"),
-    tagset="wsj",
-    encoding="ascii",
+    ['train.txt', 'test.txt'],
+    ('NP', 'VP', 'PP'),
+    tagset='wsj',
+    encoding='ascii',
 )
 conll2002 = LazyCorpusLoader(
-    "conll2002",
+    'conll2002',
     ConllChunkCorpusReader,
-    ".*\.(test|train).*",
-    ("LOC", "PER", "ORG", "MISC"),
-    encoding="utf-8",
+    '.*\.(test|train).*',
+    ('LOC', 'PER', 'ORG', 'MISC'),
+    encoding='utf-8',
 )
 conll2007 = LazyCorpusLoader(
-    "conll2007",
+    'conll2007',
     DependencyCorpusReader,
-    ".*\.(test|train).*",
-    encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
+    '.*\.(test|train).*',
+    encoding=[('eus', 'ISO-8859-2'), ('esp', 'utf8')],
 )
-crubadan = LazyCorpusLoader("crubadan", CrubadanCorpusReader, ".*\.txt")
+crubadan = LazyCorpusLoader('crubadan', CrubadanCorpusReader, '.*\.txt')
 dependency_treebank = LazyCorpusLoader(
-    "dependency_treebank", DependencyCorpusReader, ".*\.dp", encoding="ascii"
+    'dependency_treebank', DependencyCorpusReader, '.*\.dp', encoding='ascii'
 )
 floresta = LazyCorpusLoader(
-    "floresta",
+    'floresta',
     BracketParseCorpusReader,
-    r"(?!\.).*\.ptb",
-    "#",
-    tagset="unknown",
-    encoding="ISO-8859-15",
+    r'(?!\.).*\.ptb',
+    '#',
+    tagset='unknown',
+    encoding='ISO-8859-15',
 )
 framenet15 = LazyCorpusLoader(
-    "framenet_v15",
+    'framenet_v15',
     FramenetCorpusReader,
     [
-        "frRelation.xml",
-        "frameIndex.xml",
-        "fulltextIndex.xml",
-        "luIndex.xml",
-        "semTypes.xml",
+        'frRelation.xml',
+        'frameIndex.xml',
+        'fulltextIndex.xml',
+        'luIndex.xml',
+        'semTypes.xml',
     ],
 )
 framenet = LazyCorpusLoader(
-    "framenet_v17",
+    'framenet_v17',
     FramenetCorpusReader,
     [
-        "frRelation.xml",
-        "frameIndex.xml",
-        "fulltextIndex.xml",
-        "luIndex.xml",
-        "semTypes.xml",
+        'frRelation.xml',
+        'frameIndex.xml',
+        'fulltextIndex.xml',
+        'luIndex.xml',
+        'semTypes.xml',
     ],
 )
 gazetteers = LazyCorpusLoader(
-    "gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2"
+    'gazetteers', WordListCorpusReader, r'(?!LICENSE|\.).*\.txt', encoding='ISO-8859-2'
 )
 genesis = LazyCorpusLoader(
-    "genesis",
+    'genesis',
     PlaintextCorpusReader,
-    r"(?!\.).*\.txt",
+    r'(?!\.).*\.txt',
     encoding=[
-        ("finnish|french|german", "latin_1"),
-        ("swedish", "cp865"),
-        (".*", "utf_8"),
+        ('finnish|french|german', 'latin_1'),
+        ('swedish', 'cp865'),
+        ('.*', 'utf_8'),
     ],
 )
 gutenberg = LazyCorpusLoader(
-    "gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
+    'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1'
 )
-ieer = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*")
+ieer = LazyCorpusLoader('ieer', IEERCorpusReader, r'(?!README|\.).*')
 inaugural = LazyCorpusLoader(
-    "inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
+    'inaugural', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1'
 )
 # [XX] This should probably just use TaggedCorpusReader:
 indian = LazyCorpusLoader(
-    "indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8"
+    'indian', IndianCorpusReader, r'(?!\.).*\.pos', tagset='unknown', encoding='utf8'
 )
 
-jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8")
-knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp")
-lin_thesaurus = LazyCorpusLoader("lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp")
+jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*\.chasen', encoding='utf-8')
+knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
+lin_thesaurus = LazyCorpusLoader('lin_thesaurus', LinThesaurusCorpusReader, r'.*\.lsp')
 mac_morpho = LazyCorpusLoader(
-    "mac_morpho",
+    'mac_morpho',
     MacMorphoCorpusReader,
-    r"(?!\.).*\.txt",
-    tagset="unknown",
-    encoding="latin-1",
+    r'(?!\.).*\.txt',
+    tagset='unknown',
+    encoding='latin-1',
 )
 machado = LazyCorpusLoader(
-    "machado",
+    'machado',
     PortugueseCategorizedPlaintextCorpusReader,
-    r"(?!\.).*\.txt",
-    cat_pattern=r"([a-z]*)/.*",
-    encoding="latin-1",
+    r'(?!\.).*\.txt',
+    cat_pattern=r'([a-z]*)/.*',
+    encoding='latin-1',
 )
 masc_tagged = LazyCorpusLoader(
-    "masc_tagged",
+    'masc_tagged',
     CategorizedTaggedCorpusReader,
-    r"(spoken|written)/.*\.txt",
-    cat_file="categories.txt",
-    tagset="wsj",
+    r'(spoken|written)/.*\.txt',
+    cat_file='categories.txt',
+    tagset='wsj',
     encoding="utf-8",
     sep="_",
 )
 movie_reviews = LazyCorpusLoader(
-    "movie_reviews",
+    'movie_reviews',
     CategorizedPlaintextCorpusReader,
-    r"(?!\.).*\.txt",
-    cat_pattern=r"(neg|pos)/.*",
-    encoding="ascii",
+    r'(?!\.).*\.txt',
+    cat_pattern=r'(neg|pos)/.*',
+    encoding='ascii',
 )
 multext_east = LazyCorpusLoader(
-    "mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8"
+    'mte_teip5', MTECorpusReader, r'(oana).*\.xml', encoding="utf-8"
 )
 names = LazyCorpusLoader(
-    "names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii"
+    'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii'
 )
 nps_chat = LazyCorpusLoader(
-    "nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj"
+    'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj'
 )
 opinion_lexicon = LazyCorpusLoader(
-    "opinion_lexicon",
+    'opinion_lexicon',
     OpinionLexiconCorpusReader,
-    r"(\w+)\-words\.txt",
-    encoding="ISO-8859-2",
+    r'(\w+)\-words\.txt',
+    encoding='ISO-8859-2',
 )
 ppattach = LazyCorpusLoader(
-    "ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"]
+    'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset']
 )
 product_reviews_1 = LazyCorpusLoader(
-    "product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
+    'product_reviews_1', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8'
 )
 product_reviews_2 = LazyCorpusLoader(
-    "product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
+    'product_reviews_2', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8'
 )
 pros_cons = LazyCorpusLoader(
-    "pros_cons",
+    'pros_cons',
     ProsConsCorpusReader,
-    r"Integrated(Cons|Pros)\.txt",
-    cat_pattern=r"Integrated(Cons|Pros)\.txt",
-    encoding="ISO-8859-2",
+    r'Integrated(Cons|Pros)\.txt',
+    cat_pattern=r'Integrated(Cons|Pros)\.txt',
+    encoding='ISO-8859-2',
 )
 ptb = LazyCorpusLoader(  # Penn Treebank v3: WSJ and Brown portions
-    "ptb",
+    'ptb',
     CategorizedBracketParseCorpusReader,
-    r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG",
-    cat_file="allcats.txt",
-    tagset="wsj",
+    r'(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG',
+    cat_file='allcats.txt',
+    tagset='wsj',
 )
 qc = LazyCorpusLoader(
-    "qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2"
+    'qc', StringCategoryCorpusReader, ['train.txt', 'test.txt'], encoding='ISO-8859-2'
 )
 reuters = LazyCorpusLoader(
-    "reuters",
+    'reuters',
     CategorizedPlaintextCorpusReader,
-    "(training|test).*",
-    cat_file="cats.txt",
-    encoding="ISO-8859-2",
+    '(training|test).*',
+    cat_file='cats.txt',
+    encoding='ISO-8859-2',
 )
-rte = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml")
-senseval = LazyCorpusLoader("senseval", SensevalCorpusReader, r"(?!\.).*\.pos")
+rte = LazyCorpusLoader('rte', RTECorpusReader, r'(?!\.).*\.xml')
+senseval = LazyCorpusLoader('senseval', SensevalCorpusReader, r'(?!\.).*\.pos')
 sentence_polarity = LazyCorpusLoader(
-    "sentence_polarity",
+    'sentence_polarity',
     CategorizedSentencesCorpusReader,
-    r"rt-polarity\.(neg|pos)",
-    cat_pattern=r"rt-polarity\.(neg|pos)",
-    encoding="utf-8",
+    r'rt-polarity\.(neg|pos)',
+    cat_pattern=r'rt-polarity\.(neg|pos)',
+    encoding='utf-8',
 )
 sentiwordnet = LazyCorpusLoader(
-    "sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8"
+    'sentiwordnet', SentiWordNetCorpusReader, 'SentiWordNet_3.0.0.txt', encoding='utf-8'
 )
-shakespeare = LazyCorpusLoader("shakespeare", XMLCorpusReader, r"(?!\.).*\.xml")
+shakespeare = LazyCorpusLoader('shakespeare', XMLCorpusReader, r'(?!\.).*\.xml')
 sinica_treebank = LazyCorpusLoader(
-    "sinica_treebank",
+    'sinica_treebank',
     SinicaTreebankCorpusReader,
-    ["parsed"],
-    tagset="unknown",
-    encoding="utf-8",
+    ['parsed'],
+    tagset='unknown',
+    encoding='utf-8',
 )
 state_union = LazyCorpusLoader(
-    "state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2"
+    'state_union', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='ISO-8859-2'
 )
 stopwords = LazyCorpusLoader(
-    "stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8"
+    'stopwords', WordListCorpusReader, r'(?!README|\.).*', encoding='utf8'
 )
 subjectivity = LazyCorpusLoader(
-    "subjectivity",
+    'subjectivity',
     CategorizedSentencesCorpusReader,
-    r"(quote.tok.gt9|plot.tok.gt9)\.5000",
-    cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]},
-    encoding="latin-1",
+    r'(quote.tok.gt9|plot.tok.gt9)\.5000',
+    cat_map={'quote.tok.gt9.5000': ['subj'], 'plot.tok.gt9.5000': ['obj']},
+    encoding='latin-1',
 )
 swadesh = LazyCorpusLoader(
-    "swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8"
+    'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8'
 )
 swadesh110 = LazyCorpusLoader(
-    'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8'
+    'panlex_swadesh', SwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8'
 )
 swadesh207 = LazyCorpusLoader(
-    'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8'
+    'panlex_swadesh', SwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8'
 )
-switchboard = LazyCorpusLoader("switchboard", SwitchboardCorpusReader, tagset="wsj")
-timit = LazyCorpusLoader("timit", TimitCorpusReader)
+switchboard = LazyCorpusLoader('switchboard', SwitchboardCorpusReader, tagset='wsj')
+timit = LazyCorpusLoader('timit', TimitCorpusReader)
 timit_tagged = LazyCorpusLoader(
-    "timit", TimitTaggedCorpusReader, ".+\.tags", tagset="wsj", encoding="ascii"
+    'timit', TimitTaggedCorpusReader, '.+\.tags', tagset='wsj', encoding='ascii'
 )
 toolbox = LazyCorpusLoader(
-    "toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
+    'toolbox', ToolboxCorpusReader, r'(?!.*(README|\.)).*\.(dic|txt)'
 )
 treebank = LazyCorpusLoader(
-    "treebank/combined",
+    'treebank/combined',
     BracketParseCorpusReader,
-    r"wsj_.*\.mrg",
-    tagset="wsj",
-    encoding="ascii",
+    r'wsj_.*\.mrg',
+    tagset='wsj',
+    encoding='ascii',
 )
 treebank_chunk = LazyCorpusLoader(
-    "treebank/tagged",
+    'treebank/tagged',
     ChunkedCorpusReader,
-    r"wsj_.*\.pos",
-    sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True),
+    r'wsj_.*\.pos',
+    sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
     para_block_reader=tagged_treebank_para_block_reader,
-    tagset="wsj",
-    encoding="ascii",
+    tagset='wsj',
+    encoding='ascii',
 )
 treebank_raw = LazyCorpusLoader(
-    "treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
+    'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2'
 )
-twitter_samples = LazyCorpusLoader("twitter_samples", TwitterCorpusReader, ".*\.json")
-udhr = LazyCorpusLoader("udhr", UdhrCorpusReader)
-udhr2 = LazyCorpusLoader("udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8")
+twitter_samples = LazyCorpusLoader('twitter_samples', TwitterCorpusReader, '.*\.json')
+udhr = LazyCorpusLoader('udhr', UdhrCorpusReader)
+udhr2 = LazyCorpusLoader('udhr2', PlaintextCorpusReader, r'.*\.txt', encoding='utf8')
 universal_treebanks = LazyCorpusLoader(
-    "universal_treebanks_v20",
+    'universal_treebanks_v20',
     ConllCorpusReader,
-    r".*\.conll",
+    r'.*\.conll',
     columntypes=(
-        "ignore",
-        "words",
-        "ignore",
-        "ignore",
-        "pos",
-        "ignore",
-        "ignore",
-        "ignore",
-        "ignore",
-        "ignore",
+        'ignore',
+        'words',
+        'ignore',
+        'ignore',
+        'pos',
+        'ignore',
+        'ignore',
+        'ignore',
+        'ignore',
+        'ignore',
     ),
 )
-verbnet = LazyCorpusLoader("verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml")
+verbnet = LazyCorpusLoader('verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
 webtext = LazyCorpusLoader(
-    "webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2"
+    'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt', encoding='ISO-8859-2'
 )
 wordnet = LazyCorpusLoader(
-    "wordnet",
+    'wordnet',
     WordNetCorpusReader,
-    LazyCorpusLoader("omw", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
+    LazyCorpusLoader('omw', CorpusReader, r'.*/wn-data-.*\.tab', encoding='utf8'),
 )
-wordnet_ic = LazyCorpusLoader("wordnet_ic", WordNetICCorpusReader, ".*\.dat")
+wordnet_ic = LazyCorpusLoader('wordnet_ic', WordNetICCorpusReader, '.*\.dat')
 words = LazyCorpusLoader(
-    "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
+    'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii'
 )
 
 # defined after treebank
 propbank = LazyCorpusLoader(
-    "propbank",
+    'propbank',
     PropbankCorpusReader,
-    "prop.txt",
-    "frames/.*\.xml",
-    "verbs.txt",
-    lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
+    'prop.txt',
+    'frames/.*\.xml',
+    'verbs.txt',
+    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
     treebank,
 )  # Must be defined *after* treebank corpus.
 nombank = LazyCorpusLoader(
-    "nombank.1.0",
+    'nombank.1.0',
     NombankCorpusReader,
-    "nombank.1.0",
-    "frames/.*\.xml",
-    "nombank.1.0.words",
-    lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
+    'nombank.1.0',
+    'frames/.*\.xml',
+    'nombank.1.0.words',
+    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
     treebank,
 )  # Must be defined *after* treebank corpus.
 propbank_ptb = LazyCorpusLoader(
-    "propbank",
+    'propbank',
     PropbankCorpusReader,
-    "prop.txt",
-    "frames/.*\.xml",
-    "verbs.txt",
+    'prop.txt',
+    'frames/.*\.xml',
+    'verbs.txt',
     lambda filename: filename.upper(),
     ptb,
 )  # Must be defined *after* ptb corpus.
 nombank_ptb = LazyCorpusLoader(
-    "nombank.1.0",
+    'nombank.1.0',
     NombankCorpusReader,
-    "nombank.1.0",
-    "frames/.*\.xml",
-    "nombank.1.0.words",
+    'nombank.1.0',
+    'frames/.*\.xml',
+    'nombank.1.0.words',
     lambda filename: filename.upper(),
     ptb,
 )  # Must be defined *after* ptb corpus.
 semcor = LazyCorpusLoader(
-    "semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet
+    'semcor', SemcorCorpusReader, r'brown./tagfiles/br-.*\.xml', wordnet
 )  # Must be defined *after* wordnet corpus.
 
 nonbreaking_prefixes = LazyCorpusLoader(
-    "nonbreaking_prefixes",
+    'nonbreaking_prefixes',
     NonbreakingPrefixesCorpusReader,
-    r"(?!README|\.).*",
-    encoding="utf8",
+    r'(?!README|\.).*',
+    encoding='utf8',
 )
 perluniprops = LazyCorpusLoader(
-    "perluniprops",
+    'perluniprops',
     UnicharsCorpusReader,
-    r"(?!README|\.).*",
-    nltk_data_subdir="misc",
-    encoding="utf8",
+    r'(?!README|\.).*',
+    nltk_data_subdir='misc',
+    encoding='utf8',
 )
 
 # mwa_ppdb = LazyCorpusLoader(
@@ -478,7 +478,7 @@ def demo():
 
 #    ycoe.demo()
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     # demo()
     pass
 
@@ -489,5 +489,5 @@ def teardown_module(module=None):
 
     for name in dir(nltk.corpus):
         obj = getattr(nltk.corpus, name, None)
-        if isinstance(obj, CorpusReader) and hasattr(obj, "_unload"):
+        if isinstance(obj, CorpusReader) and hasattr(obj, '_unload'):
             obj._unload()
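For orientation, each LazyCorpusLoader declared above is a stand-in object that locates and loads the real corpus reader only on first use, so importing nltk.corpus stays cheap. A minimal usage sketch, assuming the gutenberg and treebank data packages have been downloaded into nltk_data:

    from nltk.corpus import gutenberg, treebank

    # First access replaces the lazy placeholder with a real PlaintextCorpusReader.
    print(gutenberg.fileids()[:3])
    print(len(gutenberg.words('austen-emma.txt')))

    # treebank wraps the combined WSJ sample with a BracketParseCorpusReader.
    print(treebank.parsed_sents()[0])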
index 414d9b7..2176b91 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/__pycache__/__init__.cpython-37.pyc differ
index 01cbbb4..350933d 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/__pycache__/europarl_raw.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/__pycache__/europarl_raw.cpython-37.pyc differ
index 3a0893f..85df09d 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/__pycache__/util.cpython-37.pyc differ
index a4caa7b..b03011c 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Europarl Corpus Readers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author:  Nitin Madnani <nmadnani@umiacs.umd.edu>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -11,45 +11,45 @@ from nltk.corpus.reader import *
 
 # Create a new corpus reader instance for each European language
 danish = LazyCorpusLoader(
-    "europarl_raw/danish", EuroparlCorpusReader, r"ep-.*\.da", encoding="utf-8"
+    'europarl_raw/danish', EuroparlCorpusReader, r'ep-.*\.da', encoding='utf-8'
 )
 
 dutch = LazyCorpusLoader(
-    "europarl_raw/dutch", EuroparlCorpusReader, r"ep-.*\.nl", encoding="utf-8"
+    'europarl_raw/dutch', EuroparlCorpusReader, r'ep-.*\.nl', encoding='utf-8'
 )
 
 english = LazyCorpusLoader(
-    "europarl_raw/english", EuroparlCorpusReader, r"ep-.*\.en", encoding="utf-8"
+    'europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8'
 )
 
 finnish = LazyCorpusLoader(
-    "europarl_raw/finnish", EuroparlCorpusReader, r"ep-.*\.fi", encoding="utf-8"
+    'europarl_raw/finnish', EuroparlCorpusReader, r'ep-.*\.fi', encoding='utf-8'
 )
 
 french = LazyCorpusLoader(
-    "europarl_raw/french", EuroparlCorpusReader, r"ep-.*\.fr", encoding="utf-8"
+    'europarl_raw/french', EuroparlCorpusReader, r'ep-.*\.fr', encoding='utf-8'
 )
 
 german = LazyCorpusLoader(
-    "europarl_raw/german", EuroparlCorpusReader, r"ep-.*\.de", encoding="utf-8"
+    'europarl_raw/german', EuroparlCorpusReader, r'ep-.*\.de', encoding='utf-8'
 )
 
 greek = LazyCorpusLoader(
-    "europarl_raw/greek", EuroparlCorpusReader, r"ep-.*\.el", encoding="utf-8"
+    'europarl_raw/greek', EuroparlCorpusReader, r'ep-.*\.el', encoding='utf-8'
 )
 
 italian = LazyCorpusLoader(
-    "europarl_raw/italian", EuroparlCorpusReader, r"ep-.*\.it", encoding="utf-8"
+    'europarl_raw/italian', EuroparlCorpusReader, r'ep-.*\.it', encoding='utf-8'
 )
 
 portuguese = LazyCorpusLoader(
-    "europarl_raw/portuguese", EuroparlCorpusReader, r"ep-.*\.pt", encoding="utf-8"
+    'europarl_raw/portuguese', EuroparlCorpusReader, r'ep-.*\.pt', encoding='utf-8'
 )
 
 spanish = LazyCorpusLoader(
-    "europarl_raw/spanish", EuroparlCorpusReader, r"ep-.*\.es", encoding="utf-8"
+    'europarl_raw/spanish', EuroparlCorpusReader, r'ep-.*\.es', encoding='utf-8'
 )
 
 swedish = LazyCorpusLoader(
-    "europarl_raw/swedish", EuroparlCorpusReader, r"ep-.*\.sv", encoding="utf-8"
+    'europarl_raw/swedish', EuroparlCorpusReader, r'ep-.*\.sv', encoding='utf-8'
 )
index a1db6d4..19c1515 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Corpus Readers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -103,7 +103,6 @@ from nltk.corpus.reader.pros_cons import *
 from nltk.corpus.reader.categorized_sents import *
 from nltk.corpus.reader.comparative_sents import *
 from nltk.corpus.reader.panlex_lite import *
-from nltk.corpus.reader.panlex_swadesh import *
 
 # Make sure that nltk.corpus.reader.bracket_parse gives the module, not
 # the function bracket_parse() defined in nltk.tree:
@@ -179,5 +178,4 @@ __all__ = [
     'NonbreakingPrefixesCorpusReader',
     'UnicharsCorpusReader',
     'MWAPPDBCorpusReader',
-    'PanlexSwadeshCorpusReader',
 ]
index dfc3584..fe100f7 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/__init__.cpython-37.pyc differ
index 6012d40..2566ddb 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/aligned.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/aligned.cpython-37.pyc differ
index b576aec..77daf69 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/api.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/api.cpython-37.pyc differ
index 56e8750..9622ca5 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/bnc.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/bnc.cpython-37.pyc differ
index 45e4401..23deba1 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/bracket_parse.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/bracket_parse.cpython-37.pyc differ
index 93e0803..d1a70ea 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/categorized_sents.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/categorized_sents.cpython-37.pyc differ
index 3e85205..09097c0 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/chasen.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/chasen.cpython-37.pyc differ
index 4f05bc6..cb6f087 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/childes.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/childes.cpython-37.pyc differ
index e2dc4e3..db1e0cb 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/chunked.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/chunked.cpython-37.pyc differ
index 02fec50..6f49b2d 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/cmudict.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/cmudict.cpython-37.pyc differ
index 8a041c1..6253b34 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/comparative_sents.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/comparative_sents.cpython-37.pyc differ
index 481bebb..83ad760 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/conll.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/conll.cpython-37.pyc differ
index e9f3a3e..6dd1179 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/crubadan.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/crubadan.cpython-37.pyc differ
index 9584742..0a0de32 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/dependency.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/dependency.cpython-37.pyc differ
index ea6b5ae..672c315 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/framenet.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/framenet.cpython-37.pyc differ
index ef6c803..1a3ae81 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/ieer.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/ieer.cpython-37.pyc differ
index 16ce5c3..c56e219 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/indian.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/indian.cpython-37.pyc differ
index 3e6ca29..bbe2b41 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/ipipan.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/ipipan.cpython-37.pyc differ
index 0d5c4bb..cf12a67 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/knbc.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/knbc.cpython-37.pyc differ
index f720098..1edd967 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/lin.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/lin.cpython-37.pyc differ
index eb277f4..f2ae237 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/mte.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/mte.cpython-37.pyc differ
index ee49da0..aca3131 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/nkjp.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/nkjp.cpython-37.pyc differ
index a6ec885..c542b22 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/nombank.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/nombank.cpython-37.pyc differ
index 943fbd0..ee006fc 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/nps_chat.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/nps_chat.cpython-37.pyc differ
index 1454152..c652ad9 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/opinion_lexicon.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/opinion_lexicon.cpython-37.pyc differ
index acb862f..e09118c 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/panlex_lite.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/panlex_lite.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/corpus/reader/__pycache__/panlex_swadesh.cpython-37.pyc b/nlp_resource_data/nltk/corpus/reader/__pycache__/panlex_swadesh.cpython-37.pyc
deleted file mode 100644 (file)
index 9665b8a..0000000
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/panlex_swadesh.cpython-37.pyc and /dev/null differ
index 6ebaa5a..82cd391 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/pl196x.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/pl196x.cpython-37.pyc differ
index d68d226..084d1a5 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/plaintext.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/plaintext.cpython-37.pyc differ
index 10bcd17..7bb7c95 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/ppattach.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/ppattach.cpython-37.pyc differ
index dc95c18..cace71d 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/propbank.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/propbank.cpython-37.pyc differ
index 415780d..747083d 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/pros_cons.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/pros_cons.cpython-37.pyc differ
index 0823ce4..a43ae9c 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/reviews.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/reviews.cpython-37.pyc differ
index e7277c5..6974a0b 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/rte.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/rte.cpython-37.pyc differ
index 7cb913f..c316eb4 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/semcor.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/semcor.cpython-37.pyc differ
index 505a1a6..6661ee1 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/senseval.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/senseval.cpython-37.pyc differ
index e3593be..38babe2 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/sentiwordnet.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/sentiwordnet.cpython-37.pyc differ
index 1d59b53..b9b9f4d 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/sinica_treebank.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/sinica_treebank.cpython-37.pyc differ
index f1f7b05..a686184 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/string_category.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/string_category.cpython-37.pyc differ
index 7a23927..2719bcf 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/switchboard.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/switchboard.cpython-37.pyc differ
index f9a64ca..5f3bacd 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/tagged.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/tagged.cpython-37.pyc differ
index 2132d55..ecacdc3 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/timit.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/timit.cpython-37.pyc differ
index e194636..8bb1d16 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/toolbox.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/toolbox.cpython-37.pyc differ
index b2dc425..9145b0f 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/twitter.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/twitter.cpython-37.pyc differ
index 0e4efeb..c83140b 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/udhr.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/udhr.cpython-37.pyc differ
index f7acada..4a489bd 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/util.cpython-37.pyc differ
index 4089e5d..377354a 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/verbnet.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/verbnet.cpython-37.pyc differ
index 3cbb84f..5f025b7 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/wordlist.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/wordlist.cpython-37.pyc differ
index ac95692..649a51d 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/wordnet.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/wordnet.cpython-37.pyc differ
index 0495bcb..d3c8c94 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/xmldocs.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/xmldocs.cpython-37.pyc differ
index da94d7d..c210437 100644 (file)
Binary files a/nlp_resource_data/nltk/corpus/reader/__pycache__/ycoe.cpython-37.pyc and b/nlp_resource_data/nltk/corpus/reader/__pycache__/ycoe.cpython-37.pyc differ
index a97fecc..0d8a67a 100644 (file)
@@ -1,10 +1,12 @@
 # Natural Language Toolkit: Aligned Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # Author: Steven Bird <stevenbird1@gmail.com>
 # For license information, see LICENSE.TXT
 
+from six import string_types
+
 from nltk.tokenize import WhitespaceTokenizer, RegexpTokenizer
 from nltk.translate import AlignedSent, Alignment
 
@@ -26,11 +28,11 @@ class AlignedCorpusReader(CorpusReader):
         self,
         root,
         fileids,
-        sep="/",
+        sep='/',
         word_tokenizer=WhitespaceTokenizer(),
-        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
+        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
         alignedsent_block_reader=read_alignedsent_block,
-        encoding="latin1",
+        encoding='latin1',
     ):
         """
         Construct a new Aligned Corpus reader for a set of documents
@@ -55,7 +57,7 @@ class AlignedCorpusReader(CorpusReader):
         """
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
index 98b3f5e..0b30f5a 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: API for Corpus Readers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -9,17 +9,22 @@
 """
 API for corpus readers.
 """
+from __future__ import unicode_literals
 
 import os
 import re
 from collections import defaultdict
 from itertools import chain
 
+from six import string_types
+
+from nltk import compat
 from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
 
 from nltk.corpus.reader.util import *
 
 
+@compat.python_2_unicode_compatible
 class CorpusReader(object):
     """
     A base class for "corpus reader" classes, each of which can be
@@ -39,7 +44,7 @@ class CorpusReader(object):
     be used to select which portion of the corpus should be returned.
     """
 
-    def __init__(self, root, fileids, encoding="utf8", tagset=None):
+    def __init__(self, root, fileids, encoding='utf8', tagset=None):
         """
         :type root: PathPointer or str
         :param root: A path pointer identifying the root directory for
@@ -71,18 +76,18 @@ class CorpusReader(object):
               tagged_...() methods.
         """
         # Convert the root to a path pointer, if necessary.
-        if isinstance(root, str) and not isinstance(root, PathPointer):
-            m = re.match("(.*\.zip)/?(.*)$|", root)
+        if isinstance(root, string_types) and not isinstance(root, PathPointer):
+            m = re.match('(.*\.zip)/?(.*)$|', root)
             zipfile, zipentry = m.groups()
             if zipfile:
                 root = ZipFilePathPointer(zipfile, zipentry)
             else:
                 root = FileSystemPathPointer(root)
         elif not isinstance(root, PathPointer):
-            raise TypeError("CorpusReader: expected a string or a PathPointer")
+            raise TypeError('CorpusReader: expected a string or a PathPointer')
 
         # If `fileids` is a regexp, then expand it.
-        if isinstance(fileids, str):
+        if isinstance(fileids, string_types):
             fileids = find_corpus_fileids(root, fileids)
 
         self._fileids = fileids
@@ -112,10 +117,10 @@ class CorpusReader(object):
 
     def __repr__(self):
         if isinstance(self._root, ZipFilePathPointer):
-            path = "%s/%s" % (self._root.zipfile.filename, self._root.entry)
+            path = '%s/%s' % (self._root.zipfile.filename, self._root.entry)
         else:
-            path = "%s" % self._root.path
-        return "<%s in %r>" % (self.__class__.__name__, path)
+            path = '%s' % self._root.path
+        return '<%s in %r>' % (self.__class__.__name__, path)
 
     def ensure_loaded(self):
         """
@@ -182,7 +187,7 @@ class CorpusReader(object):
         """
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
 
         paths = [self._root.join(f) for f in fileids]
@@ -283,26 +288,26 @@ class CategorizedCorpusReader(object):
         self._file = None  #: fileid of file containing the mapping
         self._delimiter = None  #: delimiter for ``self._file``
 
-        if "cat_pattern" in kwargs:
-            self._pattern = kwargs["cat_pattern"]
-            del kwargs["cat_pattern"]
-        elif "cat_map" in kwargs:
-            self._map = kwargs["cat_map"]
-            del kwargs["cat_map"]
-        elif "cat_file" in kwargs:
-            self._file = kwargs["cat_file"]
-            del kwargs["cat_file"]
-            if "cat_delimiter" in kwargs:
-                self._delimiter = kwargs["cat_delimiter"]
-                del kwargs["cat_delimiter"]
+        if 'cat_pattern' in kwargs:
+            self._pattern = kwargs['cat_pattern']
+            del kwargs['cat_pattern']
+        elif 'cat_map' in kwargs:
+            self._map = kwargs['cat_map']
+            del kwargs['cat_map']
+        elif 'cat_file' in kwargs:
+            self._file = kwargs['cat_file']
+            del kwargs['cat_file']
+            if 'cat_delimiter' in kwargs:
+                self._delimiter = kwargs['cat_delimiter']
+                del kwargs['cat_delimiter']
         else:
             raise ValueError(
-                "Expected keyword argument cat_pattern or " "cat_map or cat_file."
+                'Expected keyword argument cat_pattern or ' 'cat_map or cat_file.'
             )
 
-        if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs:
+        if 'cat_pattern' in kwargs or 'cat_map' in kwargs or 'cat_file' in kwargs:
             raise ValueError(
-                "Specify exactly one of: cat_pattern, " "cat_map, cat_file."
+                'Specify exactly one of: cat_pattern, ' 'cat_map, cat_file.'
             )
 
     def _init(self):
@@ -325,8 +330,8 @@ class CategorizedCorpusReader(object):
                 file_id, categories = line.split(self._delimiter, 1)
                 if file_id not in self.fileids():
                     raise ValueError(
-                        "In category mapping file %s: %s "
-                        "not found" % (self._file, file_id)
+                        'In category mapping file %s: %s '
+                        'not found' % (self._file, file_id)
                     )
                 for category in categories.split(self._delimiter):
                     self._add(file_id, category)
@@ -344,7 +349,7 @@ class CategorizedCorpusReader(object):
             self._init()
         if fileids is None:
             return sorted(self._c2f)
-        if isinstance(fileids, str):
+        if isinstance(fileids, string_types):
             fileids = [fileids]
         return sorted(set.union(*[self._f2c[d] for d in fileids]))
 
@@ -355,13 +360,13 @@ class CategorizedCorpusReader(object):
         """
         if categories is None:
             return super(CategorizedCorpusReader, self).fileids()
-        elif isinstance(categories, str):
+        elif isinstance(categories, string_types):
             if self._f2c is None:
                 self._init()
             if categories in self._c2f:
                 return sorted(self._c2f[categories])
             else:
-                raise ValueError("Category %s not found" % categories)
+                raise ValueError('Category %s not found' % categories)
         else:
             if self._f2c is None:
                 self._init()
@@ -403,7 +408,7 @@ class SyntaxCorpusReader(CorpusReader):
     def raw(self, fileids=None):
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
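CategorizedCorpusReader, whose keyword handling appears above, expects exactly one of cat_pattern (categories read off a regexp over fileids), cat_map (an explicit fileid-to-categories mapping), or cat_file (a mapping file such as cats.txt). A short usage sketch against the movie_reviews loader declared earlier, assuming that corpus is installed:

    from nltk.corpus import movie_reviews

    # movie_reviews was declared with cat_pattern=r'(neg|pos)/.*', so the leading
    # directory of every fileid doubles as its category label.
    print(movie_reviews.categories())          # ['neg', 'pos']
    print(movie_reviews.fileids('neg')[:2])    # fileids restricted to one category
    print(movie_reviews.words(categories='pos')[:5])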
 
index 4f3f148..9d02754 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Plaintext Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -57,7 +57,7 @@ class BNCCorpusReader(XMLCorpusReader):
             word tokens.  Otherwise, leave the spaces on the tokens.
         :param stem: If true, then use word stems instead of word strings.
         """
-        tag = "c5" if c5 else "pos"
+        tag = 'c5' if c5 else 'pos'
         return self._views(fileids, False, tag, strip_space, stem)
 
     def sents(self, fileids=None, strip_space=True, stem=False):
@@ -85,7 +85,7 @@ class BNCCorpusReader(XMLCorpusReader):
             word tokens.  Otherwise, leave the spaces on the tokens.
         :param stem: If true, then use word stems instead of word strings.
         """
-        tag = "c5" if c5 else "pos"
+        tag = 'c5' if c5 else 'pos'
         return self._views(
             fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem
         )
@@ -114,7 +114,7 @@ class BNCCorpusReader(XMLCorpusReader):
         result = []
 
         xmldoc = ElementTree.parse(fileid).getroot()
-        for xmlsent in xmldoc.findall(".//s"):
+        for xmlsent in xmldoc.findall('.//s'):
             sent = []
             for xmlword in _all_xmlwords_in(xmlsent):
                 word = xmlword.text
@@ -123,14 +123,14 @@ class BNCCorpusReader(XMLCorpusReader):
                 if strip_space or stem:
                     word = word.strip()
                 if stem:
-                    word = xmlword.get("hw", word)
-                if tag == "c5":
-                    word = (word, xmlword.get("c5"))
-                elif tag == "pos":
-                    word = (word, xmlword.get("pos", xmlword.get("c5")))
+                    word = xmlword.get('hw', word)
+                if tag == 'c5':
+                    word = (word, xmlword.get('c5'))
+                elif tag == 'pos':
+                    word = (word, xmlword.get('pos', xmlword.get('c5')))
                 sent.append(word)
             if bracket_sent:
-                result.append(BNCSentence(xmlsent.attrib["n"], sent))
+                result.append(BNCSentence(xmlsent.attrib['n'], sent))
             else:
                 result.extend(sent)
 
@@ -142,7 +142,7 @@ def _all_xmlwords_in(elt, result=None):
     if result is None:
         result = []
     for child in elt:
-        if child.tag in ("c", "w"):
+        if child.tag in ('c', 'w'):
             result.append(child)
         else:
             _all_xmlwords_in(child, result)
@@ -166,7 +166,7 @@ class BNCWordView(XMLCorpusView):
     """
 
     tags_to_ignore = set(
-        ["pb", "gap", "vocal", "event", "unclear", "shift", "pause", "align"]
+        ['pb', 'gap', 'vocal', 'event', 'unclear', 'shift', 'pause', 'align']
     )
     """These tags are ignored. For their description refer to the
     technical documentation, for example,
@@ -183,9 +183,9 @@ class BNCWordView(XMLCorpusView):
         :param stem: If true, then substitute stems for words.
         """
         if sent:
-            tagspec = ".*/s"
+            tagspec = '.*/s'
         else:
-            tagspec = ".*/s/(.*/)?(c|w)"
+            tagspec = '.*/s/(.*/)?(c|w)'
         self._sent = sent
         self._tag = tag
         self._strip_space = strip_space
@@ -200,7 +200,7 @@ class BNCWordView(XMLCorpusView):
 
         # Read in a tasty header.
         self._open()
-        self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
+        self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
         self.close()
 
         # Reset tag context.
@@ -208,22 +208,22 @@ class BNCWordView(XMLCorpusView):
 
     def handle_header(self, elt, context):
         # Set up some metadata!
-        titles = elt.findall("titleStmt/title")
+        titles = elt.findall('titleStmt/title')
         if titles:
-            self.title = "\n".join(title.text.strip() for title in titles)
+            self.title = '\n'.join(title.text.strip() for title in titles)
 
-        authors = elt.findall("titleStmt/author")
+        authors = elt.findall('titleStmt/author')
         if authors:
-            self.author = "\n".join(author.text.strip() for author in authors)
+            self.author = '\n'.join(author.text.strip() for author in authors)
 
-        editors = elt.findall("titleStmt/editor")
+        editors = elt.findall('titleStmt/editor')
         if editors:
-            self.editor = "\n".join(editor.text.strip() for editor in editors)
+            self.editor = '\n'.join(editor.text.strip() for editor in editors)
 
-        resps = elt.findall("titleStmt/respStmt")
+        resps = elt.findall('titleStmt/respStmt')
         if resps:
-            self.resps = "\n\n".join(
-                "\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps
+            self.resps = '\n\n'.join(
+                '\n'.join(resp_elt.text.strip() for resp_elt in resp) for resp in resps
             )
 
     def handle_elt(self, elt, context):
@@ -239,20 +239,20 @@ class BNCWordView(XMLCorpusView):
         if self._strip_space or self._stem:
             word = word.strip()
         if self._stem:
-            word = elt.get("hw", word)
-        if self._tag == "c5":
-            word = (word, elt.get("c5"))
-        elif self._tag == "pos":
-            word = (word, elt.get("pos", elt.get("c5")))
+            word = elt.get('hw', word)
+        if self._tag == 'c5':
+            word = (word, elt.get('c5'))
+        elif self._tag == 'pos':
+            word = (word, elt.get('pos', elt.get('c5')))
         return word
 
     def handle_sent(self, elt):
         sent = []
         for child in elt:
-            if child.tag in ("mw", "hi", "corr", "trunc"):
+            if child.tag in ('mw', 'hi', 'corr', 'trunc'):
                 sent += [self.handle_word(w) for w in child]
-            elif child.tag in ("w", "c"):
+            elif child.tag in ('w', 'c'):
                 sent.append(self.handle_word(child))
             elif child.tag not in self.tags_to_ignore:
-                raise ValueError("Unexpected element %s" % child.tag)
-        return BNCSentence(elt.attrib["n"], sent)
+                raise ValueError('Unexpected element %s' % child.tag)
+        return BNCSentence(elt.attrib['n'], sent)
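Unlike the loaders in nltk.corpus, BNCCorpusReader is pointed at a locally obtained copy of the British National Corpus. A usage sketch, with the root path and fileid pattern as placeholders:

    from nltk.corpus.reader.bnc import BNCCorpusReader

    # Placeholder path; the BNC itself must be obtained separately.
    bnc = BNCCorpusReader(root='/path/to/BNC/Texts', fileids=r'[A-K]/\w*/\w*\.xml')

    print(bnc.words()[:10])
    print(bnc.tagged_words(c5=True)[:5])   # C5 tags instead of the simplified POS tags
    print(bnc.sents()[0])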
index 9a958c4..55093af 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Penn Treebank Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -18,10 +18,10 @@ from nltk.corpus.reader.util import *
 from nltk.corpus.reader.api import *
 
 # we use [^\s()]+ instead of \S+? to avoid matching ()
-SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
-TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
-WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
-EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")
+SORTTAGWRD = re.compile(r'\((\d+) ([^\s()]+) ([^\s()]+)\)')
+TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')
+WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')
+EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')
 
 
 class BracketParseCorpusReader(SyntaxCorpusReader):
@@ -37,8 +37,8 @@ class BracketParseCorpusReader(SyntaxCorpusReader):
         root,
         fileids,
         comment_char=None,
-        detect_blocks="unindented_paren",
-        encoding="utf8",
+        detect_blocks='unindented_paren',
+        encoding='utf8',
         tagset=None,
     ):
         """
@@ -62,24 +62,28 @@ class BracketParseCorpusReader(SyntaxCorpusReader):
         self._tagset = tagset
 
     def _read_block(self, stream):
-        if self._detect_blocks == "sexpr":
+        if self._detect_blocks == 'sexpr':
             return read_sexpr_block(stream, comment_char=self._comment_char)
-        elif self._detect_blocks == "blankline":
+        elif self._detect_blocks == 'blankline':
             return read_blankline_block(stream)
-        elif self._detect_blocks == "unindented_paren":
+        elif self._detect_blocks == 'unindented_paren':
             # Tokens start with unindented left parens.
-            toks = read_regexp_block(stream, start_re=r"^\(")
+            toks = read_regexp_block(stream, start_re=r'^\(')
             # Strip any comments out of the tokens.
             if self._comment_char:
                 toks = [
-                    re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok)
+                    re.sub('(?m)^%s.*' % re.escape(self._comment_char), '', tok)
                     for tok in toks
                 ]
             return toks
         else:
-            assert 0, "bad block type"
+            assert 0, 'bad block type'
 
     def _normalize(self, t):
+        # If there's an empty set of brackets surrounding the actual
+        # parse, then strip them off.
+        if EMPTY_BRACKETS.match(t):
+            t = t.strip()[1:-1]
         # Replace leaves of the form (!), (,), with (! !), (, ,)
         t = re.sub(r"\((.)\)", r"(\1 \1)", t)
         # Replace leaves of the form (tag word root) with (tag word)
@@ -88,20 +92,15 @@ class BracketParseCorpusReader(SyntaxCorpusReader):
 
     def _parse(self, t):
         try:
-            tree = Tree.fromstring(self._normalize(t))
-            # If there's an empty node at the top, strip it off
-            if tree.label() == '' and len(tree) == 1:
-                return tree[0]
-            else:
-                return tree
+            return Tree.fromstring(self._normalize(t))
 
         except ValueError as e:
             sys.stderr.write("Bad tree detected; trying to recover...\n")
             # Try to recover, if we can:
-            if e.args == ("mismatched parens",):
+            if e.args == ('mismatched parens',):
                 for n in range(1, 5):
                     try:
-                        v = Tree(self._normalize(t + ")" * n))
+                        v = Tree(self._normalize(t + ')' * n))
                         sys.stderr.write(
                             "  Recovered by adding %d close " "paren(s)\n" % n
                         )
@@ -111,7 +110,7 @@ class BracketParseCorpusReader(SyntaxCorpusReader):
             # Try something else:
             sys.stderr.write("  Recovered by returning a flat parse.\n")
             # sys.stderr.write(' '.join(t.split())+'\n')
-            return Tree("S", self._tag(t))
+            return Tree('S', self._tag(t))
 
     def _tag(self, t, tagset=None):
         tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
@@ -148,7 +147,7 @@ class CategorizedBracketParseCorpusReader(
 
     def _resolve(self, fileids, categories):
         if fileids is not None and categories is not None:
-            raise ValueError("Specify fileids or categories, not both")
+            raise ValueError('Specify fileids or categories, not both')
         if categories is not None:
             return self.fileids(categories)
         else:
@@ -208,12 +207,12 @@ class AlpinoCorpusReader(BracketParseCorpusReader):
     untouched.
     """
 
-    def __init__(self, root, encoding="ISO-8859-1", tagset=None):
+    def __init__(self, root, encoding='ISO-8859-1', tagset=None):
         BracketParseCorpusReader.__init__(
             self,
             root,
-            "alpino\.xml",
-            detect_blocks="blankline",
+            'alpino\.xml',
+            detect_blocks='blankline',
             encoding=encoding,
             tagset=tagset,
         )
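The _normalize change above handles the empty bracket pair that wraps each Penn Treebank parse at the string level, before Tree.fromstring is called, rather than pruning an empty top node afterwards. An illustrative sketch of just that strip, on a made-up WSJ-style string:

    import re

    EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')

    t = '( (S (NP-SBJ (DT The) (NN dog)) (VP (VBZ barks))) )'
    if EMPTY_BRACKETS.match(t):
        # Drop the enclosing '( ... )' so only the real parse remains.
        t = t.strip()[1:-1]
    print(t.strip())   # (S (NP-SBJ (DT The) (NN dog)) (VP (VBZ barks)))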
index 0c597d5..e0a3034 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Categorized Sentences Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Pierpaolo Pantone <24alsecondo@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -34,6 +34,7 @@ Related papers:
     sentiment categorization with respect to rating scales". Proceedings of the
     ACL, 2005.
 """
+from six import string_types
 
 from nltk.corpus.reader.api import *
 from nltk.tokenize import *
@@ -78,7 +79,7 @@ class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
         fileids,
         word_tokenizer=WhitespaceTokenizer(),
         sent_tokenizer=None,
-        encoding="utf8",
+        encoding='utf8',
         **kwargs
     ):
         """
@@ -98,7 +99,7 @@ class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
 
     def _resolve(self, fileids, categories):
         if fileids is not None and categories is not None:
-            raise ValueError("Specify fileids or categories, not both")
+            raise ValueError('Specify fileids or categories, not both')
         if categories is not None:
             return self.fileids(categories)
         else:
@@ -116,7 +117,7 @@ class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
         fileids = self._resolve(fileids, categories)
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
@@ -141,7 +142,7 @@ class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
         fileids = self._resolve(fileids, categories)
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat(
             [
@@ -165,7 +166,7 @@ class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
         fileids = self._resolve(fileids, categories)
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat(
             [
index 0d0cc5e..ef60b0d 100644 (file)
@@ -1,13 +1,16 @@
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Masato Hagiwara <hagisan@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 # For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
+from __future__ import print_function
 
 import sys
 
+from six import string_types
+
 from nltk.corpus.reader import util
 
 from nltk.corpus.reader.util import *
@@ -15,14 +18,14 @@ from nltk.corpus.reader.api import *
 
 
 class ChasenCorpusReader(CorpusReader):
-    def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
+    def __init__(self, root, fileids, encoding='utf8', sent_splitter=None):
         self._sent_splitter = sent_splitter
         CorpusReader.__init__(self, root, fileids, encoding)
 
     def raw(self, fileids=None):
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
@@ -106,9 +109,9 @@ class ChasenCorpusView(StreamBackedCorpusView):
             sent = []
             for line in para_str.splitlines():
 
-                _eos = line.strip() == "EOS"
-                _cells = line.split("\t")
-                w = (_cells[0], "\t".join(_cells[1:]))
+                _eos = line.strip() == 'EOS'
+                _cells = line.split('\t')
+                w = (_cells[0], '\t'.join(_cells[1:]))
                 if not _eos:
                     sent.append(w)
 
@@ -143,12 +146,12 @@ def demo():
     import nltk
     from nltk.corpus.util import LazyCorpusLoader
 
-    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
-    print("/".join(jeita.words()[22100:22140]))
+    jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
+    print('/'.join(jeita.words()[22100:22140]))
 
     print(
-        "\nEOS\n".join(
-            "\n".join("%s/%s" % (w[0], w[1].split("\t")[2]) for w in sent)
+        '\nEOS\n'.join(
+            '\n'.join("%s/%s" % (w[0], w[1].split('\t')[2]) for w in sent)
             for sent in jeita.tagged_sents()[2170:2173]
         )
     )
@@ -158,11 +161,11 @@ def test():
 
     from nltk.corpus.util import LazyCorpusLoader
 
-    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
+    jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
 
-    assert isinstance(jeita.tagged_words()[0][1], str)
+    assert isinstance(jeita.tagged_words()[0][1], string_types)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
     test()
index 1d163c6..7c5faa9 100644 (file)
@@ -1,6 +1,6 @@
 # CHILDES XML Corpus Reader
 
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Tomonori Nagano <tnagano@gc.cuny.edu>
 #         Alexis Dimitriadis <A.Dimitriadis@uu.nl>
 # URL: <http://nltk.org/>
@@ -9,11 +9,13 @@
 """
 Corpus reader for the XML version of the CHILDES corpus.
 """
+from __future__ import print_function, division
 
-__docformat__ = "epytext en"
+__docformat__ = 'epytext en'
 
 import re
 from collections import defaultdict
+from six import string_types
 
 from nltk.util import flatten, LazyMap, LazyConcatenation
 
@@ -21,14 +23,14 @@ from nltk.corpus.reader.util import concat
 from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
 
 # to resolve the namespace issue
-NS = "http://www.talkbank.org/ns/talkbank"
+NS = 'http://www.talkbank.org/ns/talkbank'
 
 
 class CHILDESCorpusReader(XMLCorpusReader):
     """
     Corpus reader for the XML version of the CHILDES corpus.
-    The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
-    version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
+    The CHILDES corpus is available at ``http://childes.psy.cmu.edu/``. The XML
+    version of CHILDES is located at ``http://childes.psy.cmu.edu/data-xml/``.
     Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
     (``nltk_data/corpora/CHILDES/``).
 
@@ -43,7 +45,7 @@ class CHILDESCorpusReader(XMLCorpusReader):
     def words(
         self,
         fileids=None,
-        speaker="ALL",
+        speaker='ALL',
         stem=False,
         relation=False,
         strip_space=True,
@@ -83,7 +85,7 @@ class CHILDESCorpusReader(XMLCorpusReader):
     def tagged_words(
         self,
         fileids=None,
-        speaker="ALL",
+        speaker='ALL',
         stem=False,
         relation=False,
         strip_space=True,
@@ -125,7 +127,7 @@ class CHILDESCorpusReader(XMLCorpusReader):
     def sents(
         self,
         fileids=None,
-        speaker="ALL",
+        speaker='ALL',
         stem=False,
         relation=None,
         strip_space=True,
@@ -167,7 +169,7 @@ class CHILDESCorpusReader(XMLCorpusReader):
     def tagged_sents(
         self,
         fileids=None,
-        speaker="ALL",
+        speaker='ALL',
         stem=False,
         relation=None,
         strip_space=True,
@@ -241,13 +243,13 @@ class CHILDESCorpusReader(XMLCorpusReader):
         # getting participants' data
         pat = dictOfDicts()
         for participant in xmldoc.findall(
-            ".//{%s}Participants/{%s}participant" % (NS, NS)
+            './/{%s}Participants/{%s}participant' % (NS, NS)
         ):
             for (key, value) in participant.items():
-                pat[participant.get("id")][key] = value
+                pat[participant.get('id')][key] = value
         return pat
 
-    def age(self, fileids=None, speaker="CHI", month=False):
+    def age(self, fileids=None, speaker='CHI', month=False):
         """
         :return: the given file(s) as string or int
         :rtype: list or int
@@ -264,10 +266,10 @@ class CHILDESCorpusReader(XMLCorpusReader):
 
     def _get_age(self, fileid, speaker, month):
         xmldoc = ElementTree.parse(fileid).getroot()
-        for pat in xmldoc.findall(".//{%s}Participants/{%s}participant" % (NS, NS)):
+        for pat in xmldoc.findall('.//{%s}Participants/{%s}participant' % (NS, NS)):
             try:
-                if pat.get("id") == speaker:
-                    age = pat.get("age")
+                if pat.get('id') == speaker:
+                    age = pat.get('age')
                     if month:
                         age = self.convert_age(age)
                     return age
@@ -287,7 +289,7 @@ class CHILDESCorpusReader(XMLCorpusReader):
             pass
         return age_month
 
-    def MLU(self, fileids=None, speaker="CHI"):
+    def MLU(self, fileids=None, speaker='CHI'):
         """
         :return: the given file(s) as a floating number
         :rtype: list(float)
@@ -318,7 +320,7 @@ class CHILDESCorpusReader(XMLCorpusReader):
         for sent in sents:
             posList = [pos for (word, pos) in sent]
             # if any part of the sentence is intelligible
-            if any(pos == "unk" for pos in posList):
+            if any(pos == 'unk' for pos in posList):
                 continue
             # if the sentence is null
             elif sent == []:
@@ -329,8 +331,8 @@ class CHILDESCorpusReader(XMLCorpusReader):
             else:
                 results.append([word for (word, pos) in sent])
                 # count number of fillers
-                if len(set(["co", None]).intersection(posList)) > 0:
-                    numFillers += posList.count("co")
+                if len(set(['co', None]).intersection(posList)) > 0:
+                    numFillers += posList.count('co')
                     numFillers += posList.count(None)
                     sentDiscount += 1
             lastSent = sent
@@ -339,7 +341,7 @@ class CHILDESCorpusReader(XMLCorpusReader):
             # count number of morphemes
             # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
             numWords = (
-                len(flatten([word.split("-") for word in thisWordList])) - numFillers
+                len(flatten([word.split('-') for word in thisWordList])) - numFillers
             )
             numSents = len(results) - sentDiscount
             mlu = numWords / numSents
@@ -352,54 +354,54 @@ class CHILDESCorpusReader(XMLCorpusReader):
         self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
     ):
         if (
-            isinstance(speaker, str) and speaker != "ALL"
+            isinstance(speaker, string_types) and speaker != 'ALL'
         ):  # ensure we have a list of speakers
             speaker = [speaker]
         xmldoc = ElementTree.parse(fileid).getroot()
         # processing each xml doc
         results = []
-        for xmlsent in xmldoc.findall(".//{%s}u" % NS):
+        for xmlsent in xmldoc.findall('.//{%s}u' % NS):
             sents = []
             # select speakers
-            if speaker == "ALL" or xmlsent.get("who") in speaker:
-                for xmlword in xmlsent.findall(".//{%s}w" % NS):
+            if speaker == 'ALL' or xmlsent.get('who') in speaker:
+                for xmlword in xmlsent.findall('.//{%s}w' % NS):
                     infl = None
                     suffixStem = None
                     suffixTag = None
                     # getting replaced words
-                    if replace and xmlsent.find(".//{%s}w/{%s}replacement" % (NS, NS)):
+                    if replace and xmlsent.find('.//{%s}w/{%s}replacement' % (NS, NS)):
                         xmlword = xmlsent.find(
-                            ".//{%s}w/{%s}replacement/{%s}w" % (NS, NS, NS)
+                            './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS)
                         )
-                    elif replace and xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS)):
-                        xmlword = xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS))
+                    elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS)):
+                        xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS))
                     # get text
                     if xmlword.text:
                         word = xmlword.text
                     else:
-                        word = ""
+                        word = ''
                     # strip tailing space
                     if strip_space:
                         word = word.strip()
                     # stem
                     if relation or stem:
                         try:
-                            xmlstem = xmlword.find(".//{%s}stem" % NS)
+                            xmlstem = xmlword.find('.//{%s}stem' % NS)
                             word = xmlstem.text
                         except AttributeError as e:
                             pass
                         # if there is an inflection
                         try:
                             xmlinfl = xmlword.find(
-                                ".//{%s}mor/{%s}mw/{%s}mk" % (NS, NS, NS)
+                                './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS)
                             )
-                            word += "-" + xmlinfl.text
+                            word += '-' + xmlinfl.text
                         except:
                             pass
                         # if there is a suffix
                         try:
                             xmlsuffix = xmlword.find(
-                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
+                                './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
                                 % (NS, NS, NS, NS)
                             )
                             suffixStem = xmlsuffix.text
@@ -420,11 +422,11 @@ class CHILDESCorpusReader(XMLCorpusReader):
                             tag = ""
                         try:
                             xmlsuffixpos = xmlword.findall(
-                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
+                                './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c'
                                 % (NS, NS, NS, NS, NS)
                             )
                             xmlsuffixpos2 = xmlword.findall(
-                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
+                                './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s'
                                 % (NS, NS, NS, NS, NS)
                             )
                             if xmlsuffixpos2:
@@ -443,17 +445,17 @@ class CHILDESCorpusReader(XMLCorpusReader):
                     # <mor></mor><mor type="trn"><gra type="grt">
                     if relation == True:
                         for xmlstem_rel in xmlword.findall(
-                            ".//{%s}mor/{%s}gra" % (NS, NS)
+                            './/{%s}mor/{%s}gra' % (NS, NS)
                         ):
-                            if not xmlstem_rel.get("type") == "grt":
+                            if not xmlstem_rel.get('type') == 'grt':
                                 word = (
                                     word[0],
                                     word[1],
-                                    xmlstem_rel.get("index")
+                                    xmlstem_rel.get('index')
                                     + "|"
-                                    + xmlstem_rel.get("head")
+                                    + xmlstem_rel.get('head')
                                     + "|"
-                                    + xmlstem_rel.get("relation"),
+                                    + xmlstem_rel.get('relation'),
                                 )
                             else:
                                 word = (
@@ -462,25 +464,25 @@ class CHILDESCorpusReader(XMLCorpusReader):
                                     word[2],
                                     word[0],
                                     word[1],
-                                    xmlstem_rel.get("index")
+                                    xmlstem_rel.get('index')
                                     + "|"
-                                    + xmlstem_rel.get("head")
+                                    + xmlstem_rel.get('head')
                                     + "|"
-                                    + xmlstem_rel.get("relation"),
+                                    + xmlstem_rel.get('relation'),
                                 )
                         try:
                             for xmlpost_rel in xmlword.findall(
-                                ".//{%s}mor/{%s}mor-post/{%s}gra" % (NS, NS, NS)
+                                './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)
                             ):
-                                if not xmlpost_rel.get("type") == "grt":
+                                if not xmlpost_rel.get('type') == 'grt':
                                     suffixStem = (
                                         suffixStem[0],
                                         suffixStem[1],
-                                        xmlpost_rel.get("index")
+                                        xmlpost_rel.get('index')
                                         + "|"
-                                        + xmlpost_rel.get("head")
+                                        + xmlpost_rel.get('head')
                                         + "|"
-                                        + xmlpost_rel.get("relation"),
+                                        + xmlpost_rel.get('relation'),
                                     )
                                 else:
                                     suffixStem = (
@@ -489,11 +491,11 @@ class CHILDESCorpusReader(XMLCorpusReader):
                                         suffixStem[2],
                                         suffixStem[0],
                                         suffixStem[1],
-                                        xmlpost_rel.get("index")
+                                        xmlpost_rel.get('index')
                                         + "|"
-                                        + xmlpost_rel.get("head")
+                                        + xmlpost_rel.get('head')
                                         + "|"
-                                        + xmlpost_rel.get("relation"),
+                                        + xmlpost_rel.get('relation'),
                                     )
                         except:
                             pass
@@ -511,7 +513,7 @@ class CHILDESCorpusReader(XMLCorpusReader):
     shouldn't need to be changed, unless CHILDES changes the configuration
     of their server or unless the user sets up their own corpus webserver.
     """
-    childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="
+    childes_url_base = r'http://childes.psy.cmu.edu/browser/index.php?url='
 
     def webview_file(self, fileid, urlbase=None):
         """Map a corpus file to its web version on the CHILDES website,
@@ -534,27 +536,27 @@ class CHILDESCorpusReader(XMLCorpusReader):
         corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
         """
 
-        import webbrowser
+        import webbrowser, re
 
         if urlbase:
             path = urlbase + "/" + fileid
         else:
             full = self.root + "/" + fileid
-            full = re.sub(r"\\", "/", full)
-            if "/childes/" in full.lower():
+            full = re.sub(r'\\', '/', full)
+            if '/childes/' in full.lower():
                 # Discard /data-xml/ if present
-                path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0]
-            elif "eng-usa" in full.lower():
-                path = "Eng-USA/" + re.findall(r"/(?i)Eng-USA/(.*)\.xml", full)[0]
+                path = re.findall(r'(?i)/childes(?:/data-xml)?/(.*)\.xml', full)[0]
+            elif 'eng-usa' in full.lower():
+                path = 'Eng-USA/' + re.findall(r'/(?i)Eng-USA/(.*)\.xml', full)[0]
             else:
                 path = fileid
 
         # Strip ".xml" and add ".cha", as necessary:
-        if path.endswith(".xml"):
+        if path.endswith('.xml'):
             path = path[:-4]
 
-        if not path.endswith(".cha"):
-            path = path + ".cha"
+        if not path.endswith('.cha'):
+            path = path + '.cha'
 
         url = self.childes_url_base + path
 
@@ -572,20 +574,20 @@ def demo(corpus_root=None):
     if not corpus_root:
         from nltk.data import find
 
-        corpus_root = find("corpora/childes/data-xml/Eng-USA/")
+        corpus_root = find('corpora/childes/data-xml/Eng-USA/')
 
     try:
-        childes = CHILDESCorpusReader(corpus_root, ".*.xml")
+        childes = CHILDESCorpusReader(corpus_root, '.*.xml')
         # describe all corpus
         for file in childes.fileids()[:5]:
-            corpus = ""
-            corpus_id = ""
+            corpus = ''
+            corpus_id = ''
             for (key, value) in childes.corpus(file)[0].items():
                 if key == "Corpus":
                     corpus = value
                 if key == "Id":
                     corpus_id = value
-            print("Reading", corpus, corpus_id, " .....")
+            print('Reading', corpus, corpus_id, ' .....')
             print("words:", childes.words(file)[:7], "...")
             print(
                 "words with replaced words:",
@@ -593,8 +595,8 @@ def demo(corpus_root=None):
                 " ...",
             )
             print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
-            print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...")
-            print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...")
+            print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
+            print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
             print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
             print(
                 "words with relations and pos-tag:",
@@ -615,13 +617,13 @@ def demo(corpus_root=None):
     except LookupError as e:
         print(
             """The CHILDES corpus, or the parts you need, should be manually
-        downloaded from https://childes.talkbank.org/data-xml/ and saved at
+        downloaded from http://childes.psy.cmu.edu/data-xml/ and saved at
         [NLTK_Data_Dir]/corpora/childes/
             Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
         demo('/path/to/childes/data-xml/Eng-USA/")
         """
         )
-        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
+        # corpus_root_http = urllib2.urlopen('http://childes.psy.cmu.edu/data-xml/Eng-USA/Bates.zip')
         # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
         ##this fails
         # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
index bb32832..0edd0ea 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Chunked Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -13,6 +13,8 @@ documents.
 
 import os.path, codecs
 
+from six import string_types
+
 import nltk
 from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
 from nltk.tree import Tree
@@ -38,11 +40,11 @@ class ChunkedCorpusReader(CorpusReader):
         self,
         root,
         fileids,
-        extension="",
+        extension='',
         str2chunktree=tagstr2tree,
-        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
+        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
         para_block_reader=read_blankline_block,
-        encoding="utf8",
+        encoding='utf8',
         tagset=None,
     ):
         """
@@ -61,7 +63,7 @@ class ChunkedCorpusReader(CorpusReader):
         """
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
@@ -279,5 +281,5 @@ class ChunkedCorpusView(StreamBackedCorpusView):
             elif isinstance(child, tuple):
                 tree[i] = child[0]
             else:
-                raise ValueError("expected child to be Tree or tuple")
+                raise ValueError('expected child to be Tree or tuple')
         return tree
index ba1cdf9..a4aef7d 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -45,6 +45,7 @@ Y       yield   Y IY L D       Z       zee     Z IY
 ZH      seizure S IY ZH ER
 """
 
+from nltk import compat
 from nltk.util import Index
 
 from nltk.corpus.reader.util import *
@@ -69,7 +70,7 @@ class CMUDictCorpusReader(CorpusReader):
         :return: the cmudict lexicon as a raw string.
         """
         fileids = self._fileids
-        if isinstance(fileids, str):
+        if isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
@@ -91,7 +92,7 @@ def read_cmudict_block(stream):
     entries = []
     while len(entries) < 100:  # Read 100 at a time.
         line = stream.readline()
-        if line == "":
+        if line == '':
             return entries  # end of file.
         pieces = line.split()
         entries.append((pieces[0].lower(), pieces[2:]))
index 9d6fcdb..30d00cc 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Comparative Sentence Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Pierpaolo Pantone <24alsecondo@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -35,17 +35,19 @@ Related papers:
 """
 import re
 
+from six import string_types
+
 from nltk.corpus.reader.api import *
 from nltk.tokenize import *
 
 # Regular expressions for dataset components
-STARS = re.compile(r"^\*+$")
-COMPARISON = re.compile(r"<cs-[1234]>")
-CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")
-GRAD_COMPARISON = re.compile(r"<cs-[123]>")
-NON_GRAD_COMPARISON = re.compile(r"<cs-4>")
+STARS = re.compile(r'^\*+$')
+COMPARISON = re.compile(r'<cs-[1234]>')
+CLOSE_COMPARISON = re.compile(r'</cs-[1234]>')
+GRAD_COMPARISON = re.compile(r'<cs-[123]>')
+NON_GRAD_COMPARISON = re.compile(r'<cs-4>')
 ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
-KEYWORD = re.compile(r"\((?!.*\()(.*)\)$")
+KEYWORD = re.compile(r'\((?!.*\()(.*)\)$')
 
 
 class Comparison(object):
@@ -81,8 +83,8 @@ class Comparison(object):
 
     def __repr__(self):
         return (
-            'Comparison(text="{}", comp_type={}, entity_1="{}", entity_2="{}", '
-            'feature="{}", keyword="{}")'
+            "Comparison(text=\"{}\", comp_type={}, entity_1=\"{}\", entity_2=\"{}\", "
+            "feature=\"{}\", keyword=\"{}\")"
         ).format(
             self.text,
             self.comp_type,
@@ -119,7 +121,7 @@ class ComparativeSentencesCorpusReader(CorpusReader):
         fileids,
         word_tokenizer=WhitespaceTokenizer(),
         sent_tokenizer=None,
-        encoding="utf8",
+        encoding='utf8',
     ):
         """
         :param root: The root directory for this corpus.
@@ -145,7 +147,7 @@ class ComparativeSentencesCorpusReader(CorpusReader):
         """
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat(
             [
@@ -195,7 +197,7 @@ class ComparativeSentencesCorpusReader(CorpusReader):
         """
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
@@ -259,7 +261,7 @@ class ComparativeSentencesCorpusReader(CorpusReader):
                 if grad_comparisons:
                     # Each comparison tag has its own relations on a separate line
                     for comp in grad_comparisons:
-                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
+                        comp_type = int(re.match(r'<cs-(\d)>', comp).group(1))
                         comparison = Comparison(
                             text=comparison_text, comp_type=comp_type
                         )
@@ -267,11 +269,11 @@ class ComparativeSentencesCorpusReader(CorpusReader):
                         entities_feats = ENTITIES_FEATS.findall(line)
                         if entities_feats:
                             for (code, entity_feat) in entities_feats:
-                                if code == "1":
+                                if code == '1':
                                     comparison.entity_1 = entity_feat.strip()
-                                elif code == "2":
+                                elif code == '2':
                                     comparison.entity_2 = entity_feat.strip()
-                                elif code == "3":
+                                elif code == '3':
                                     comparison.feature = entity_feat.strip()
                         keyword = KEYWORD.findall(line)
                         if keyword:
@@ -282,7 +284,7 @@ class ComparativeSentencesCorpusReader(CorpusReader):
                 if non_grad_comparisons:
                     for comp in non_grad_comparisons:
                         # comp_type in this case should always be 4.
-                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
+                        comp_type = int(re.match(r'<cs-(\d)>', comp).group(1))
                         comparison = Comparison(
                             text=comparison_text, comp_type=comp_type
                         )
index e138a1b..26849be 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: CONLL Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -10,8 +10,11 @@
 Read CoNLL-style chunk fileids.
 """
 
+from __future__ import unicode_literals
+
 import textwrap
 
+from nltk import compat
 from nltk.tree import Tree
 from nltk.util import LazyMap, LazyConcatenation
 from nltk.tag import map_tag
@@ -50,13 +53,13 @@ class ConllCorpusReader(CorpusReader):
     # Column Types
     # /////////////////////////////////////////////////////////////////
 
-    WORDS = "words"  #: column type for words
-    POS = "pos"  #: column type for part-of-speech tags
-    TREE = "tree"  #: column type for parse trees
-    CHUNK = "chunk"  #: column type for chunk structures
-    NE = "ne"  #: column type for named entities
-    SRL = "srl"  #: column type for semantic role labels
-    IGNORE = "ignore"  #: column type for column that should be ignored
+    WORDS = 'words'  #: column type for words
+    POS = 'pos'  #: column type for part-of-speech tags
+    TREE = 'tree'  #: column type for parse trees
+    CHUNK = 'chunk'  #: column type for chunk structures
+    NE = 'ne'  #: column type for named entities
+    SRL = 'srl'  #: column type for semantic role labels
+    IGNORE = 'ignore'  #: column type for column that should be ignored
 
     #: A list of all column types supported by the conll corpus reader.
     COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)
@@ -71,18 +74,18 @@ class ConllCorpusReader(CorpusReader):
         fileids,
         columntypes,
         chunk_types=None,
-        root_label="S",
+        root_label='S',
         pos_in_tree=False,
         srl_includes_roleset=True,
-        encoding="utf8",
+        encoding='utf8',
         tree_class=Tree,
         tagset=None,
         separator=None,
     ):
         for columntype in columntypes:
             if columntype not in self.COLUMN_TYPES:
-                raise ValueError("Bad column type %r" % columntype)
-        if isinstance(chunk_types, str):
+                raise ValueError('Bad column type %r' % columntype)
+        if isinstance(chunk_types, string_types):
             chunk_types = [chunk_types]
         self._chunk_types = chunk_types
         self._colmap = dict((c, i) for (i, c) in enumerate(columntypes))
@@ -101,7 +104,7 @@ class ConllCorpusReader(CorpusReader):
     def raw(self, fileids=None):
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
@@ -226,17 +229,17 @@ class ConllCorpusReader(CorpusReader):
             if not block:
                 continue
 
-            grid = [line.split(self.sep) for line in block.split("\n")]
+            grid = [line.split(self.sep) for line in block.split('\n')]
 
             # If there's a docstart row, then discard. ([xx] eventually it
             # would be good to actually use it)
-            if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-":
+            if grid[0][self._colmap.get('words', 0)] == '-DOCSTART-':
                 del grid[0]
 
             # Check that the grid is consistent.
             for row in grid:
                 if len(row) != len(grid[0]):
-                    raise ValueError("Inconsistent number of columns:\n%s" % block)
+                    raise ValueError('Inconsistent number of columns:\n%s' % block)
             grids.append(grid)
         return grids
 
@@ -247,52 +250,52 @@ class ConllCorpusReader(CorpusReader):
     # a list of words or a parse tree).
 
     def _get_words(self, grid):
-        return self._get_column(grid, self._colmap["words"])
+        return self._get_column(grid, self._colmap['words'])
 
     def _get_tagged_words(self, grid, tagset=None):
-        pos_tags = self._get_column(grid, self._colmap["pos"])
+        pos_tags = self._get_column(grid, self._colmap['pos'])
         if tagset and tagset != self._tagset:
             pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
-        return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags))
+        return list(zip(self._get_column(grid, self._colmap['words']), pos_tags))
 
     def _get_iob_words(self, grid, tagset=None):
-        pos_tags = self._get_column(grid, self._colmap["pos"])
+        pos_tags = self._get_column(grid, self._colmap['pos'])
         if tagset and tagset != self._tagset:
             pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
         return list(
             zip(
-                self._get_column(grid, self._colmap["words"]),
+                self._get_column(grid, self._colmap['words']),
                 pos_tags,
-                self._get_column(grid, self._colmap["chunk"]),
+                self._get_column(grid, self._colmap['chunk']),
             )
         )
 
     def _get_chunked_words(self, grid, chunk_types, tagset=None):
         # n.b.: this method is very similar to conllstr2tree.
-        words = self._get_column(grid, self._colmap["words"])
-        pos_tags = self._get_column(grid, self._colmap["pos"])
+        words = self._get_column(grid, self._colmap['words'])
+        pos_tags = self._get_column(grid, self._colmap['pos'])
         if tagset and tagset != self._tagset:
             pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
-        chunk_tags = self._get_column(grid, self._colmap["chunk"])
+        chunk_tags = self._get_column(grid, self._colmap['chunk'])
 
         stack = [Tree(self._root_label, [])]
 
         for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
-            if chunk_tag == "O":
-                state, chunk_type = "O", ""
+            if chunk_tag == 'O':
+                state, chunk_type = 'O', ''
             else:
-                (state, chunk_type) = chunk_tag.split("-")
+                (state, chunk_type) = chunk_tag.split('-')
             # If it's a chunk we don't care about, treat it as O.
             if chunk_types is not None and chunk_type not in chunk_types:
-                state = "O"
+                state = 'O'
             # Treat a mismatching I like a B.
-            if state == "I" and chunk_type != stack[-1].label():
-                state = "B"
+            if state == 'I' and chunk_type != stack[-1].label():
+                state = 'B'
             # For B or I: close any open chunks
-            if state in "BO" and len(stack) == 2:
+            if state in 'BO' and len(stack) == 2:
                 stack.pop()
             # For B: start a new chunk.
-            if state == "B":
+            if state == 'B':
                 new_chunk = Tree(chunk_type, [])
                 stack[-1].append(new_chunk)
                 stack.append(new_chunk)
@@ -302,29 +305,29 @@ class ConllCorpusReader(CorpusReader):
         return stack[0]
 
     def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
-        words = self._get_column(grid, self._colmap["words"])
-        pos_tags = self._get_column(grid, self._colmap["pos"])
+        words = self._get_column(grid, self._colmap['words'])
+        pos_tags = self._get_column(grid, self._colmap['pos'])
         if tagset and tagset != self._tagset:
             pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
-        parse_tags = self._get_column(grid, self._colmap["tree"])
+        parse_tags = self._get_column(grid, self._colmap['tree'])
 
-        treestr = ""
+        treestr = ''
         for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
-            if word == "(":
-                word = "-LRB-"
-            if word == ")":
-                word = "-RRB-"
-            if pos_tag == "(":
-                pos_tag = "-LRB-"
-            if pos_tag == ")":
-                pos_tag = "-RRB-"
-            (left, right) = parse_tag.split("*")
-            right = right.count(")") * ")"  # only keep ')'.
-            treestr += "%s (%s %s) %s" % (left, pos_tag, word, right)
+            if word == '(':
+                word = '-LRB-'
+            if word == ')':
+                word = '-RRB-'
+            if pos_tag == '(':
+                pos_tag = '-LRB-'
+            if pos_tag == ')':
+                pos_tag = '-RRB-'
+            (left, right) = parse_tag.split('*')
+            right = right.count(')') * ')'  # only keep ')'.
+            treestr += '%s (%s %s) %s' % (left, pos_tag, word, right)
         try:
             tree = self._tree_class.fromstring(treestr)
         except (ValueError, IndexError):
-            tree = self._tree_class.fromstring("(%s %s)" % (self._root_label, treestr))
+            tree = self._tree_class.fromstring('(%s %s)' % (self._root_label, treestr))
 
         if not pos_in_tree:
             for subtree in tree.subtrees():
@@ -332,7 +335,7 @@ class ConllCorpusReader(CorpusReader):
                     if (
                         isinstance(child, Tree)
                         and len(child) == 1
-                        and isinstance(child[0], str)
+                        and isinstance(child[0], string_types)
                     ):
                         subtree[i] = (child[0], child.label())
 
@@ -343,15 +346,15 @@ class ConllCorpusReader(CorpusReader):
         list of list of (start, end), tag) tuples
         """
         if self._srl_includes_roleset:
-            predicates = self._get_column(grid, self._colmap["srl"] + 1)
-            start_col = self._colmap["srl"] + 2
+            predicates = self._get_column(grid, self._colmap['srl'] + 1)
+            start_col = self._colmap['srl'] + 2
         else:
-            predicates = self._get_column(grid, self._colmap["srl"])
-            start_col = self._colmap["srl"] + 1
+            predicates = self._get_column(grid, self._colmap['srl'])
+            start_col = self._colmap['srl'] + 1
 
         # Count how many predicates there are.  This tells us how many
         # columns to expect for SRL data.
-        num_preds = len([p for p in predicates if p != "-"])
+        num_preds = len([p for p in predicates if p != '-'])
 
         spanlists = []
         for i in range(num_preds):
@@ -359,11 +362,11 @@ class ConllCorpusReader(CorpusReader):
             spanlist = []
             stack = []
             for wordnum, srl_tag in enumerate(col):
-                (left, right) = srl_tag.split("*")
-                for tag in left.split("("):
+                (left, right) = srl_tag.split('*')
+                for tag in left.split('('):
                     if tag:
                         stack.append((tag, wordnum))
-                for i in range(right.count(")")):
+                for i in range(right.count(')')):
                     (tag, start) = stack.pop()
                     spanlist.append(((start, wordnum + 1), tag))
             spanlists.append(spanlist)
@@ -374,28 +377,28 @@ class ConllCorpusReader(CorpusReader):
         tree = self._get_parsed_sent(grid, pos_in_tree)
         spanlists = self._get_srl_spans(grid)
         if self._srl_includes_roleset:
-            predicates = self._get_column(grid, self._colmap["srl"] + 1)
-            rolesets = self._get_column(grid, self._colmap["srl"])
+            predicates = self._get_column(grid, self._colmap['srl'] + 1)
+            rolesets = self._get_column(grid, self._colmap['srl'])
         else:
-            predicates = self._get_column(grid, self._colmap["srl"])
+            predicates = self._get_column(grid, self._colmap['srl'])
             rolesets = [None] * len(predicates)
 
         instances = ConllSRLInstanceList(tree)
         for wordnum, predicate in enumerate(predicates):
-            if predicate == "-":
+            if predicate == '-':
                 continue
             # Decide which spanlist to use.  Don't assume that they're
             # sorted in the same order as the predicates (even though
             # they usually are).
             for spanlist in spanlists:
                 for (start, end), tag in spanlist:
-                    if wordnum in range(start, end) and tag in ("V", "C-V"):
+                    if wordnum in range(start, end) and tag in ('V', 'C-V'):
                         break
                 else:
                     continue
                 break
             else:
-                raise ValueError("No srl column found for %r" % predicate)
+                raise ValueError('No srl column found for %r' % predicate)
             instances.append(
                 ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist)
             )
@@ -410,7 +413,7 @@ class ConllCorpusReader(CorpusReader):
         for columntype in columntypes:
             if columntype not in self._colmap:
                 raise ValueError(
-                    "This corpus does not contain a %s " "column." % columntype
+                    'This corpus does not contain a %s ' 'column.' % columntype
                 )
 
     @staticmethod
@@ -418,6 +421,7 @@ class ConllCorpusReader(CorpusReader):
         return [grid[i][column_index] for i in range(len(grid))]
 
 
+@compat.python_2_unicode_compatible
 class ConllSRLInstance(object):
     """
     An SRL instance from a CoNLL corpus, which identifies and
@@ -463,7 +467,7 @@ class ConllSRLInstance(object):
 
         # Fill in the self.verb and self.arguments values.
         for (start, end), tag in tagged_spans:
-            if tag in ("V", "C-V"):
+            if tag in ('V', 'C-V'):
                 self.verb += list(range(start, end))
             else:
                 self.arguments.append(((start, end), tag))
@@ -471,31 +475,32 @@ class ConllSRLInstance(object):
     def __repr__(self):
         # Originally, its:
         ##plural = 's' if len(self.arguments) != 1 else ''
-        plural = "s" if len(self.arguments) != 1 else ""
-        return "<ConllSRLInstance for %r with %d argument%s>" % (
+        plural = 's' if len(self.arguments) != 1 else ''
+        return '<ConllSRLInstance for %r with %d argument%s>' % (
             (self.verb_stem, len(self.arguments), plural)
         )
 
     def pprint(self):
-        verbstr = " ".join(self.words[i][0] for i in self.verb)
-        hdr = "SRL for %r (stem=%r):\n" % (verbstr, self.verb_stem)
-        s = ""
+        verbstr = ' '.join(self.words[i][0] for i in self.verb)
+        hdr = 'SRL for %r (stem=%r):\n' % (verbstr, self.verb_stem)
+        s = ''
         for i, word in enumerate(self.words):
             if isinstance(word, tuple):
                 word = word[0]
             for (start, end), argid in self.arguments:
                 if i == start:
-                    s += "[%s " % argid
+                    s += '[%s ' % argid
                 if i == end:
-                    s += "] "
+                    s += '] '
             if i in self.verb:
-                word = "<<%s>>" % word
-            s += word + " "
+                word = '<<%s>>' % word
+            s += word + ' '
         return hdr + textwrap.fill(
-            s.replace(" ]", "]"), initial_indent="    ", subsequent_indent="    "
+            s.replace(' ]', ']'), initial_indent='    ', subsequent_indent='    '
         )
 
 
+@compat.python_2_unicode_compatible
 class ConllSRLInstanceList(list):
     """
     Set of instances for a single sentence
@@ -512,45 +517,45 @@ class ConllSRLInstanceList(list):
         # Sanity check: trees should be the same
         for inst in self:
             if inst.tree != self.tree:
-                raise ValueError("Tree mismatch!")
+                raise ValueError('Tree mismatch!')
 
         # If desired, add trees:
         if include_tree:
             words = self.tree.leaves()
             pos = [None] * len(words)
-            synt = ["*"] * len(words)
+            synt = ['*'] * len(words)
             self._tree2conll(self.tree, 0, words, pos, synt)
 
-        s = ""
+        s = ''
         for i in range(len(words)):
             # optional tree columns
             if include_tree:
-                s += "%-20s " % words[i]
-                s += "%-8s " % pos[i]
-                s += "%15s*%-8s " % tuple(synt[i].split("*"))
+                s += '%-20s ' % words[i]
+                s += '%-8s ' % pos[i]
+                s += '%15s*%-8s ' % tuple(synt[i].split('*'))
 
             # verb head column
             for inst in self:
                 if i == inst.verb_head:
-                    s += "%-20s " % inst.verb_stem
+                    s += '%-20s ' % inst.verb_stem
                     break
             else:
-                s += "%-20s " % "-"
+                s += '%-20s ' % '-'
             # Remaining columns: self
             for inst in self:
-                argstr = "*"
+                argstr = '*'
                 for (start, end), argid in inst.tagged_spans:
                     if i == start:
-                        argstr = "(%s%s" % (argid, argstr)
+                        argstr = '(%s%s' % (argid, argstr)
                     if i == (end - 1):
-                        argstr += ")"
-                s += "%-12s " % argstr
-            s += "\n"
+                        argstr += ')'
+                s += '%-12s ' % argstr
+            s += '\n'
         return s
 
     def _tree2conll(self, tree, wordnum, words, pos, synt):
         assert isinstance(tree, Tree)
-        if len(tree) == 1 and isinstance(tree[0], str):
+        if len(tree) == 1 and isinstance(tree[0], string_types):
             pos[wordnum] = tree.label()
             assert words[wordnum] == tree[0]
             return wordnum + 1
@@ -559,10 +564,10 @@ class ConllSRLInstanceList(list):
             pos[wordnum], pos[wordnum] = tree[0]
             return wordnum + 1
         else:
-            synt[wordnum] = "(%s%s" % (tree.label(), synt[wordnum])
+            synt[wordnum] = '(%s%s' % (tree.label(), synt[wordnum])
             for child in tree:
                 wordnum = self._tree2conll(child, wordnum, words, pos, synt)
-            synt[wordnum - 1] += ")"
+            synt[wordnum - 1] += ')'
             return wordnum
 
 
@@ -573,13 +578,13 @@ class ConllChunkCorpusReader(ConllCorpusReader):
     """
 
     def __init__(
-        self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None
+        self, root, fileids, chunk_types, encoding='utf8', tagset=None, separator=None
     ):
         ConllCorpusReader.__init__(
             self,
             root,
             fileids,
-            ("words", "pos", "chunk"),
+            ('words', 'pos', 'chunk'),
             chunk_types=chunk_types,
             encoding=encoding,
             tagset=tagset,
index 1831236..8470b06 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: An Crubadan N-grams Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Avital Pekker <avital.pekker@utoronto.ca>
 #
 # URL: <http://nltk.org/>
@@ -19,9 +19,12 @@ For details about An Crubadan, this data, and its potential uses, see:
 http://borel.slu.edu/crubadan/index.html
 """
 
+from __future__ import print_function, unicode_literals
+
 import re
 from os import path
 
+from nltk.compat import PY3
 from nltk.corpus.reader import CorpusReader
 from nltk.probability import FreqDist
 from nltk.data import ZipFilePathPointer
@@ -32,17 +35,17 @@ class CrubadanCorpusReader(CorpusReader):
     A corpus reader used to access language An Crubadan n-gram files.
     """
 
-    _LANG_MAPPER_FILE = "table.txt"
+    _LANG_MAPPER_FILE = 'table.txt'
     _all_lang_freq = {}
 
-    def __init__(self, root, fileids, encoding="utf8", tagset=None):
-        super(CrubadanCorpusReader, self).__init__(root, fileids, encoding="utf8")
+    def __init__(self, root, fileids, encoding='utf8', tagset=None):
+        super(CrubadanCorpusReader, self).__init__(root, fileids, encoding='utf8')
         self._lang_mapping_data = []
         self._load_lang_mapping_data()
 
     def lang_freq(self, lang):
-        """ Return n-gram FreqDist for a specific language
-            given ISO 639-3 language code """
+        ''' Return n-gram FreqDist for a specific language
+            given ISO 639-3 language code '''
 
         if lang not in self._all_lang_freq:
             self._all_lang_freq[lang] = self._load_lang_ngrams(lang)
@@ -50,23 +53,23 @@ class CrubadanCorpusReader(CorpusReader):
         return self._all_lang_freq[lang]
 
     def langs(self):
-        """ Return a list of supported languages as ISO 639-3 codes """
+        ''' Return a list of supported languages as ISO 639-3 codes '''
         return [row[1] for row in self._lang_mapping_data]
 
     def iso_to_crubadan(self, lang):
-        """ Return internal Crubadan code based on ISO 639-3 code """
+        ''' Return internal Crubadan code based on ISO 639-3 code '''
         for i in self._lang_mapping_data:
             if i[1].lower() == lang.lower():
                 return i[0]
 
     def crubadan_to_iso(self, lang):
-        """ Return ISO 639-3 code given internal Crubadan code """
+        ''' Return ISO 639-3 code given internal Crubadan code '''
         for i in self._lang_mapping_data:
             if i[0].lower() == lang.lower():
                 return i[1]
 
     def _load_lang_mapping_data(self):
-        """ Load language mappings between codes and description from table.txt """
+        ''' Load language mappings between codes and description from table.txt '''
         if isinstance(self.root, ZipFilePathPointer):
             raise RuntimeError(
                 "Please install the 'crubadan' corpus first, use nltk.download()"
@@ -76,30 +79,39 @@ class CrubadanCorpusReader(CorpusReader):
         if self._LANG_MAPPER_FILE not in self.fileids():
             raise RuntimeError("Could not find language mapper file: " + mapper_file)
 
-        raw = open(mapper_file, "r", encoding="utf-8").read().strip()
+        if PY3:
+            raw = open(mapper_file, 'r', encoding='utf-8').read().strip()
+        else:
+            raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
 
-        self._lang_mapping_data = [row.split("\t") for row in raw.split("\n")]
+        self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
 
     def _load_lang_ngrams(self, lang):
-        """ Load single n-gram language file given the ISO 639-3 language code
-            and return its FreqDist """
+        ''' Load single n-gram language file given the ISO 639-3 language code
+            and return its FreqDist '''
 
         if lang not in self.langs():
             raise RuntimeError("Unsupported language.")
 
         crubadan_code = self.iso_to_crubadan(lang)
-        ngram_file = path.join(self.root, crubadan_code + "-3grams.txt")
+        ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')
 
         if not path.isfile(ngram_file):
             raise RuntimeError("No N-gram file found for requested language.")
 
         counts = FreqDist()
-        f = open(ngram_file, "r", encoding="utf-8")
+        if PY3:
+            f = open(ngram_file, 'r', encoding='utf-8')
+        else:
+            f = open(ngram_file, 'rU')
 
         for line in f:
-            data = line.split(" ")
+            if PY3:
+                data = line.split(' ')
+            else:
+                data = line.decode('utf8').split(' ')
 
-            ngram = data[1].strip("\n")
+            ngram = data[1].strip('\n')
             freq = int(data[0])
 
             counts[ngram] = freq
index 4314fbd..49e7423 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Dependency Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Kepa Sarasola <kepa.sarasola@ehu.es>
 #         Iker Manterola <returntothehangar@hotmail.com>
 #
@@ -21,9 +21,9 @@ class DependencyCorpusReader(SyntaxCorpusReader):
         self,
         root,
         fileids,
-        encoding="utf8",
+        encoding='utf8',
         word_tokenizer=TabTokenizer(),
-        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
+        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
         para_block_reader=read_blankline_block,
     ):
         # FIXME: Why is it inheritting from SyntaxCorpusReader but initializing
@@ -89,7 +89,7 @@ class DependencyCorpusReader(SyntaxCorpusReader):
 
 
 class DependencyCorpusView(StreamBackedCorpusView):
-    _DOCSTART = "-DOCSTART- -DOCSTART- O\n"  # dokumentu hasiera definitzen da
+    _DOCSTART = '-DOCSTART- -DOCSTART- O\n'  # dokumentu hasiera definitzen da
 
     def __init__(
         self,
@@ -98,7 +98,7 @@ class DependencyCorpusView(StreamBackedCorpusView):
         group_by_sent,
         dependencies,
         chunk_types=None,
-        encoding="utf8",
+        encoding='utf8',
     ):
         self._tagged = tagged
         self._dependencies = dependencies
@@ -115,13 +115,13 @@ class DependencyCorpusView(StreamBackedCorpusView):
 
         # extract word and tag from any of the formats
         if not self._dependencies:
-            lines = [line.split("\t") for line in sent.split("\n")]
+            lines = [line.split('\t') for line in sent.split('\n')]
             if len(lines[0]) == 3 or len(lines[0]) == 4:
                 sent = [(line[0], line[1]) for line in lines]
             elif len(lines[0]) == 10:
                 sent = [(line[1], line[4]) for line in lines]
             else:
-                raise ValueError("Unexpected number of fields in dependency tree file")
+                raise ValueError('Unexpected number of fields in dependency tree file')
 
             # discard tags if they weren't requested
             if not self._tagged:
index 4eaa6d1..9705f4a 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Framenet Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors: Chuck Wooters <wooters@icsi.berkeley.edu>,
 #          Nathan Schneider <nathan.schneider@georgetown.edu>
 # URL: <http://nltk.org/>
@@ -10,6 +10,7 @@
 """
 Corpus reader for the FrameNet 1.7 lexicon and corpus.
 """
+from __future__ import print_function, unicode_literals
 
 import os
 import re
@@ -19,15 +20,16 @@ import sys
 import types
 from collections import defaultdict, OrderedDict
 from operator import itemgetter
-from itertools import zip_longest
 
+from six import string_types, text_type
+from six.moves import zip_longest
 from pprint import pprint
 
 from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView
-
+from nltk.compat import python_2_unicode_compatible
 from nltk.util import LazyConcatenation, LazyMap, LazyIteratorList
 
-__docformat__ = "epytext en"
+__docformat__ = 'epytext en'
 
 
 def mimic_wrap(lines, wrap_at=65, **kwargs):
@@ -35,7 +37,7 @@ def mimic_wrap(lines, wrap_at=65, **kwargs):
     Wrap the first of 'lines' with textwrap and the remaining lines at exactly the same
     positions as the first.
     """
-    l0 = textwrap.fill(lines[0], wrap_at, drop_whitespace=False).split("\n")
+    l0 = textwrap.fill(lines[0], wrap_at, drop_whitespace=False).split('\n')
     yield l0
 
     def _(line):
@@ -46,14 +48,14 @@ def mimic_wrap(lines, wrap_at=65, **kwargs):
             il0 += 1
         if line:  # Remaining stuff on this line past the end of the mimicked line.
             # So just textwrap this line.
-            for ln in textwrap.fill(line, wrap_at, drop_whitespace=False).split("\n"):
+            for ln in textwrap.fill(line, wrap_at, drop_whitespace=False).split('\n'):
                 yield ln
 
     for l in lines[1:]:
         yield list(_(l))
 
 
-def _pretty_longstring(defstr, prefix="", wrap_at=65):
+def _pretty_longstring(defstr, prefix='', wrap_at=65):
 
     """
     Helper function for pretty-printing a long string.
@@ -65,8 +67,8 @@ def _pretty_longstring(defstr, prefix="", wrap_at=65):
     """
 
     outstr = ""
-    for line in textwrap.fill(defstr, wrap_at).split("\n"):
-        outstr += prefix + line + "\n"
+    for line in textwrap.fill(defstr, wrap_at).split('\n'):
+        outstr += prefix + line + '\n'
     return outstr
 
 
@@ -83,10 +85,10 @@ def _pretty_any(obj):
 
     outstr = ""
     for k in obj:
-        if isinstance(obj[k], str) and len(obj[k]) > 65:
+        if isinstance(obj[k], string_types) and len(obj[k]) > 65:
             outstr += "[{0}]\n".format(k)
-            outstr += "{0}".format(_pretty_longstring(obj[k], prefix="  "))
-            outstr += "\n"
+            outstr += "{0}".format(_pretty_longstring(obj[k], prefix='  '))
+            outstr += '\n'
         else:
             outstr += "[{0}] {1}\n".format(k, obj[k])
 
@@ -110,11 +112,11 @@ def _pretty_semtype(st):
 
     outstr = ""
     outstr += "semantic type ({0.ID}): {0.name}\n".format(st)
-    if "abbrev" in semkeys:
+    if 'abbrev' in semkeys:
         outstr += "[abbrev] {0}\n".format(st.abbrev)
-    if "definition" in semkeys:
+    if 'definition' in semkeys:
         outstr += "[definition]\n"
-        outstr += _pretty_longstring(st.definition, "  ")
+        outstr += _pretty_longstring(st.definition, '  ')
     outstr += "[rootType] {0}({1})\n".format(st.rootType.name, st.rootType.ID)
     if st.superType is None:
         outstr += "[superType] <None>\n"
@@ -123,8 +125,8 @@ def _pretty_semtype(st):
     outstr += "[subTypes] {0} subtypes\n".format(len(st.subTypes))
     outstr += (
         "  "
-        + ", ".join("{0}({1})".format(x.name, x.ID) for x in st.subTypes)
-        + "\n" * (len(st.subTypes) > 0)
+        + ", ".join('{0}({1})'.format(x.name, x.ID) for x in st.subTypes)
+        + '\n' * (len(st.subTypes) > 0)
     )
     return outstr
 
@@ -191,40 +193,40 @@ def _pretty_lu(lu):
     lukeys = lu.keys()
     outstr = ""
     outstr += "lexical unit ({0.ID}): {0.name}\n\n".format(lu)
-    if "definition" in lukeys:
+    if 'definition' in lukeys:
         outstr += "[definition]\n"
-        outstr += _pretty_longstring(lu.definition, "  ")
-    if "frame" in lukeys:
+        outstr += _pretty_longstring(lu.definition, '  ')
+    if 'frame' in lukeys:
         outstr += "\n[frame] {0}({1})\n".format(lu.frame.name, lu.frame.ID)
-    if "incorporatedFE" in lukeys:
+    if 'incorporatedFE' in lukeys:
         outstr += "\n[incorporatedFE] {0}\n".format(lu.incorporatedFE)
-    if "POS" in lukeys:
+    if 'POS' in lukeys:
         outstr += "\n[POS] {0}\n".format(lu.POS)
-    if "status" in lukeys:
+    if 'status' in lukeys:
         outstr += "\n[status] {0}\n".format(lu.status)
-    if "totalAnnotated" in lukeys:
+    if 'totalAnnotated' in lukeys:
         outstr += "\n[totalAnnotated] {0} annotated examples\n".format(
             lu.totalAnnotated
         )
-    if "lexemes" in lukeys:
+    if 'lexemes' in lukeys:
         outstr += "\n[lexemes] {0}\n".format(
-            " ".join("{0}/{1}".format(lex.name, lex.POS) for lex in lu.lexemes)
+            ' '.join('{0}/{1}'.format(lex.name, lex.POS) for lex in lu.lexemes)
         )
-    if "semTypes" in lukeys:
+    if 'semTypes' in lukeys:
         outstr += "\n[semTypes] {0} semantic types\n".format(len(lu.semTypes))
         outstr += (
             "  " * (len(lu.semTypes) > 0)
-            + ", ".join("{0}({1})".format(x.name, x.ID) for x in lu.semTypes)
-            + "\n" * (len(lu.semTypes) > 0)
+            + ", ".join('{0}({1})'.format(x.name, x.ID) for x in lu.semTypes)
+            + '\n' * (len(lu.semTypes) > 0)
         )
-    if "URL" in lukeys:
+    if 'URL' in lukeys:
         outstr += "\n[URL] {0}\n".format(lu.URL)
-    if "subCorpus" in lukeys:
+    if 'subCorpus' in lukeys:
         subc = [x.name for x in lu.subCorpus]
         outstr += "\n[subCorpus] {0} subcorpora\n".format(len(lu.subCorpus))
-        for line in textwrap.fill(", ".join(sorted(subc)), 60).split("\n"):
+        for line in textwrap.fill(", ".join(sorted(subc)), 60).split('\n'):
             outstr += "  {0}\n".format(line)
-    if "exemplars" in lukeys:
+    if 'exemplars' in lukeys:
         outstr += "\n[exemplars] {0} sentences across all subcorpora\n".format(
             len(lu.exemplars)
         )
@@ -284,7 +286,7 @@ def _pretty_fulltext_sentence(sent):
 
     outstr = ""
     outstr += "full-text sentence ({0.ID}) in {1}:\n\n".format(
-        sent, sent.doc.get("name", sent.doc.description)
+        sent, sent.doc.get('name', sent.doc.description)
     )
     outstr += "\n[POS] {0} tags\n".format(len(sent.POS))
     outstr += "\n[POS_tagset] {0}\n\n".format(sent.POS_tagset)
@@ -314,29 +316,29 @@ def _pretty_pos(aset):
 
     sent = aset.sent
     s0 = sent.text
-    s1 = ""
-    s2 = ""
+    s1 = ''
+    s2 = ''
     i = 0
     adjust = 0
     for j, k, lbl in overt:
-        assert j >= i, ("Overlapping targets?", (j, k, lbl))
-        s1 += " " * (j - i) + "-" * (k - j)
+        assert j >= i, ('Overlapping targets?', (j, k, lbl))
+        s1 += ' ' * (j - i) + '-' * (k - j)
         if len(lbl) > (k - j):
             # add space in the sentence to make room for the annotation index
             amt = len(lbl) - (k - j)
             s0 = (
-                s0[: k + adjust] + "~" * amt + s0[k + adjust :]
+                s0[: k + adjust] + '~' * amt + s0[k + adjust :]
             )  # '~' to prevent line wrapping
-            s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :]
+            s1 = s1[: k + adjust] + ' ' * amt + s1[k + adjust :]
             adjust += amt
-        s2 += " " * (j - i) + lbl.ljust(k - j)
+        s2 += ' ' * (j - i) + lbl.ljust(k - j)
         i = k
 
     long_lines = [s0, s1, s2]
 
-    outstr += "\n\n".join(
-        map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" "))
-    ).replace("~", " ")
+    outstr += '\n\n'.join(
+        map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' '))
+    ).replace('~', ' ')
     outstr += "\n"
     return outstr
 
@@ -358,13 +360,13 @@ def _pretty_annotation(sent, aset_level=False):
     outstr += " ({0.ID}):\n".format(sent)
     if aset_level:  # TODO: any UNANN exemplars?
         outstr += "\n[status] {0}\n".format(sent.status)
-    for k in ("corpID", "docID", "paragNo", "sentNo", "aPos"):
+    for k in ('corpID', 'docID', 'paragNo', 'sentNo', 'aPos'):
         if k in sentkeys:
             outstr += "[{0}] {1}\n".format(k, sent[k])
     outstr += (
         "\n[LU] ({0.ID}) {0.name} in {0.frame.name}\n".format(sent.LU)
         if sent.LU
-        else "\n[LU] Not found!"
+        else '\n[LU] Not found!'
     )
     outstr += "\n[frame] ({0.ID}) {0.name}\n".format(
         sent.frame
@@ -420,7 +422,7 @@ def _pretty_annotation(sent, aset_level=False):
     - Scon: (none)
     - Art: (none)
     """
-    for lyr in ("NER", "WSL", "Other", "Sent"):
+    for lyr in ('NER', 'WSL', 'Other', 'Sent'):
         if lyr in sent and sent[lyr]:
             outstr += "\n[{0}] {1} entr{2}\n".format(
                 lyr, len(sent[lyr]), "ies" if len(sent[lyr]) != 1 else "y"
@@ -428,12 +430,12 @@ def _pretty_annotation(sent, aset_level=False):
     outstr += "\n[text] + [Target] + [FE]"
     # POS-specific layers: syntactically important words that are neither the target
     # nor the FEs. Include these along with the first FE layer but with '^' underlining.
-    for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"):
+    for lyr in ('Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art'):
         if lyr in sent and sent[lyr]:
             outstr += " + [{0}]".format(lyr)
-    if "FE2" in sentkeys:
+    if 'FE2' in sentkeys:
         outstr += " + [FE2]"
-        if "FE3" in sentkeys:
+        if 'FE3' in sentkeys:
             outstr += " + [FE3]"
     outstr += "\n\n"
     outstr += sent._ascii()  # -> _annotation_ascii()
@@ -443,15 +445,15 @@ def _pretty_annotation(sent, aset_level=False):
 
 
 def _annotation_ascii(sent):
-    """
+    '''
     Given a sentence or FE annotation set, construct the width-limited string showing
     an ASCII visualization of the sentence's annotations, calling either
     _annotation_ascii_frames() or _annotation_ascii_FEs() as appropriate.
     This will be attached as a method to appropriate AttrDict instances
     and called in the full pretty-printing of the instance.
-    """
-    if sent._type == "fulltext_sentence" or (
-        "annotationSet" in sent and len(sent.annotationSet) > 2
+    '''
+    if sent._type == 'fulltext_sentence' or (
+        'annotationSet' in sent and len(sent.annotationSet) > 2
     ):
         # a full-text sentence OR sentence with multiple targets.
         # (multiple targets = >2 annotation sets, because the first annotation set is POS.)
@@ -461,24 +463,24 @@ def _annotation_ascii(sent):
 
 
 def _annotation_ascii_frames(sent):
-    """
+    '''
     ASCII string rendering of the sentence along with its targets and frame names.
     Called for all full-text sentences, as well as the few LU sentences with multiple
     targets (e.g., fn.lu(6412).exemplars[82] has two want.v targets).
     Line-wrapped to limit the display width.
-    """
+    '''
     # list the target spans and their associated aset index
     overt = []
     for a, aset in enumerate(sent.annotationSet[1:]):
         for j, k in aset.Target:
             indexS = "[{0}]".format(a + 1)
-            if aset.status == "UNANN" or aset.LU.status == "Problem":
+            if aset.status == 'UNANN' or aset.LU.status == 'Problem':
                 indexS += " "
-                if aset.status == "UNANN":
+                if aset.status == 'UNANN':
                     indexS += (
                         "!"
                     )  # warning indicator that there is a frame annotation but no FE annotation
-                if aset.LU.status == "Problem":
+                if aset.LU.status == 'Problem':
                     indexS += (
                         "?"
                     )  # warning indicator that there is a missing LU definition (because the LU has Problem status)
@@ -497,37 +499,37 @@ def _annotation_ascii_frames(sent):
                 combinedIndex = (
                     overt[o - 1][3] + asetIndex
                 )  # e.g., '[1][2]', '[1]! [2]'
-                combinedIndex = combinedIndex.replace(" !", "! ").replace(" ?", "? ")
+                combinedIndex = combinedIndex.replace(' !', '! ').replace(' ?', '? ')
                 overt[o - 1] = overt[o - 1][:3] + (combinedIndex,)
                 duplicates.add(o)
             else:  # different frames, same or overlapping targets
                 s = sent.text
                 for j, k, fname, asetIndex in overt:
-                    s += "\n" + asetIndex + " " + sent.text[j:k] + " :: " + fname
-                s += "\n(Unable to display sentence with targets marked inline due to overlap)"
+                    s += '\n' + asetIndex + ' ' + sent.text[j:k] + ' :: ' + fname
+                s += '\n(Unable to display sentence with targets marked inline due to overlap)'
                 return s
     for o in reversed(sorted(duplicates)):
         del overt[o]
 
     s0 = sent.text
-    s1 = ""
-    s11 = ""
-    s2 = ""
+    s1 = ''
+    s11 = ''
+    s2 = ''
     i = 0
     adjust = 0
     fAbbrevs = OrderedDict()
     for j, k, fname, asetIndex in overt:
         if not j >= i:
             assert j >= i, (
-                "Overlapping targets?"
+                'Overlapping targets?'
                 + (
-                    " UNANN"
-                    if any(aset.status == "UNANN" for aset in sent.annotationSet[1:])
-                    else ""
+                    ' UNANN'
+                    if any(aset.status == 'UNANN' for aset in sent.annotationSet[1:])
+                    else ''
                 ),
                 (j, k, asetIndex),
             )
-        s1 += " " * (j - i) + "*" * (k - j)
+        s1 += ' ' * (j - i) + '*' * (k - j)
         short = fname[: k - j]
         if (k - j) < len(fname):
             r = 0
@@ -538,39 +540,39 @@ def _annotation_ascii_frames(sent):
                 short = fname[: k - j - 1] + str(r)
             else:  # short not in fAbbrevs
                 fAbbrevs[short] = fname
-        s11 += " " * (j - i) + short.ljust(k - j)
+        s11 += ' ' * (j - i) + short.ljust(k - j)
         if len(asetIndex) > (k - j):
             # add space in the sentence to make room for the annotation index
             amt = len(asetIndex) - (k - j)
             s0 = (
-                s0[: k + adjust] + "~" * amt + s0[k + adjust :]
+                s0[: k + adjust] + '~' * amt + s0[k + adjust :]
             )  # '~' to prevent line wrapping
-            s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :]
-            s11 = s11[: k + adjust] + " " * amt + s11[k + adjust :]
+            s1 = s1[: k + adjust] + ' ' * amt + s1[k + adjust :]
+            s11 = s11[: k + adjust] + ' ' * amt + s11[k + adjust :]
             adjust += amt
-        s2 += " " * (j - i) + asetIndex.ljust(k - j)
+        s2 += ' ' * (j - i) + asetIndex.ljust(k - j)
         i = k
 
     long_lines = [s0, s1, s11, s2]
 
-    outstr = "\n\n".join(
-        map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" "))
-    ).replace("~", " ")
-    outstr += "\n"
+    outstr = '\n\n'.join(
+        map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' '))
+    ).replace('~', ' ')
+    outstr += '\n'
     if fAbbrevs:
-        outstr += " (" + ", ".join("=".join(pair) for pair in fAbbrevs.items()) + ")"
-        assert len(fAbbrevs) == len(dict(fAbbrevs)), "Abbreviation clash"
+        outstr += ' (' + ', '.join('='.join(pair) for pair in fAbbrevs.items()) + ')'
+        assert len(fAbbrevs) == len(dict(fAbbrevs)), 'Abbreviation clash'
 
     return outstr
 
 
 def _annotation_ascii_FE_layer(overt, ni, feAbbrevs):
-    """Helper for _annotation_ascii_FEs()."""
-    s1 = ""
-    s2 = ""
+    '''Helper for _annotation_ascii_FEs().'''
+    s1 = ''
+    s2 = ''
     i = 0
     for j, k, fename in overt:
-        s1 += " " * (j - i) + ("^" if fename.islower() else "-") * (k - j)
+        s1 += ' ' * (j - i) + ('^' if fename.islower() else '-') * (k - j)
         short = fename[: k - j]
         if len(fename) > len(short):
             r = 0
@@ -581,30 +583,30 @@ def _annotation_ascii_FE_layer(overt, ni, feAbbrevs):
                 short = fename[: k - j - 1] + str(r)
             else:  # short not in feAbbrevs
                 feAbbrevs[short] = fename
-        s2 += " " * (j - i) + short.ljust(k - j)
+        s2 += ' ' * (j - i) + short.ljust(k - j)
         i = k
 
-    sNI = ""
+    sNI = ''
     if ni:
-        sNI += " [" + ", ".join(":".join(x) for x in sorted(ni.items())) + "]"
+        sNI += ' [' + ', '.join(':'.join(x) for x in sorted(ni.items())) + ']'
     return [s1, s2, sNI]
 
 
 def _annotation_ascii_FEs(sent):
-    """
+    '''
     ASCII string rendering of the sentence along with a single target and its FEs.
     Secondary and tertiary FE layers are included if present.
     'sent' can be an FE annotation set or an LU sentence with a single target.
     Line-wrapped to limit the display width.
-    """
+    '''
     feAbbrevs = OrderedDict()
     posspec = []  # POS-specific layer spans (e.g., Supp[ort], Cop[ula])
     posspec_separate = False
-    for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"):
+    for lyr in ('Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art'):
         if lyr in sent and sent[lyr]:
             for a, b, lbl in sent[lyr]:
                 if (
-                    lbl == "X"
+                    lbl == 'X'
                 ):  # skip this, which covers an entire phrase typically containing the target and all its FEs
                     # (but do display the Gov)
                     continue
@@ -614,7 +616,7 @@ def _annotation_ascii_FEs(sent):
                         True
                     )  # show POS-specific layers on a separate line
                 posspec.append(
-                    (a, b, lbl.lower().replace("-", ""))
+                    (a, b, lbl.lower().replace('-', ''))
                 )  # lowercase Cop=>cop, Non-Asp=>nonasp, etc. to distinguish from FE names
     if posspec_separate:
         POSSPEC = _annotation_ascii_FE_layer(posspec, {}, feAbbrevs)
@@ -624,20 +626,20 @@ def _annotation_ascii_FEs(sent):
         feAbbrevs,
     )
     FE2 = FE3 = None
-    if "FE2" in sent:
+    if 'FE2' in sent:
         FE2 = _annotation_ascii_FE_layer(sent.FE2[0], sent.FE2[1], feAbbrevs)
-        if "FE3" in sent:
+        if 'FE3' in sent:
             FE3 = _annotation_ascii_FE_layer(sent.FE3[0], sent.FE3[1], feAbbrevs)
 
     for i, j in sent.Target:
         FE1span, FE1name, FE1exp = FE1
         if len(FE1span) < j:
-            FE1span += " " * (j - len(FE1span))
+            FE1span += ' ' * (j - len(FE1span))
         if len(FE1name) < j:
-            FE1name += " " * (j - len(FE1name))
+            FE1name += ' ' * (j - len(FE1name))
             FE1[1] = FE1name
         FE1[0] = (
-            FE1span[:i] + FE1span[i:j].replace(" ", "*").replace("-", "=") + FE1span[j:]
+            FE1span[:i] + FE1span[i:j].replace(' ', '*').replace('-', '=') + FE1span[j:]
         )
     long_lines = [sent.text]
     if posspec_separate:
@@ -647,13 +649,13 @@ def _annotation_ascii_FEs(sent):
         long_lines.extend([FE2[0], FE2[1] + FE2[2]])
         if FE3:
             long_lines.extend([FE3[0], FE3[1] + FE3[2]])
-    long_lines.append("")
-    outstr = "\n".join(
-        map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" "))
+    long_lines.append('')
+    outstr = '\n'.join(
+        map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' '))
     )
     if feAbbrevs:
-        outstr += "(" + ", ".join("=".join(pair) for pair in feAbbrevs.items()) + ")"
-        assert len(feAbbrevs) == len(dict(feAbbrevs)), "Abbreviation clash"
+        outstr += '(' + ', '.join('='.join(pair) for pair in feAbbrevs.items()) + ')'
+        assert len(feAbbrevs) == len(dict(feAbbrevs)), 'Abbreviation clash'
     outstr += "\n"
 
     return outstr
@@ -674,31 +676,31 @@ def _pretty_fe(fe):
     outstr += "frame element ({0.ID}): {0.name}\n    of {1.name}({1.ID})\n".format(
         fe, fe.frame
     )
-    if "definition" in fekeys:
+    if 'definition' in fekeys:
         outstr += "[definition]\n"
-        outstr += _pretty_longstring(fe.definition, "  ")
-    if "abbrev" in fekeys:
+        outstr += _pretty_longstring(fe.definition, '  ')
+    if 'abbrev' in fekeys:
         outstr += "[abbrev] {0}\n".format(fe.abbrev)
-    if "coreType" in fekeys:
+    if 'coreType' in fekeys:
         outstr += "[coreType] {0}\n".format(fe.coreType)
-    if "requiresFE" in fekeys:
+    if 'requiresFE' in fekeys:
         outstr += "[requiresFE] "
         if fe.requiresFE is None:
             outstr += "<None>\n"
         else:
             outstr += "{0}({1})\n".format(fe.requiresFE.name, fe.requiresFE.ID)
-    if "excludesFE" in fekeys:
+    if 'excludesFE' in fekeys:
         outstr += "[excludesFE] "
         if fe.excludesFE is None:
             outstr += "<None>\n"
         else:
             outstr += "{0}({1})\n".format(fe.excludesFE.name, fe.excludesFE.ID)
-    if "semType" in fekeys:
+    if 'semType' in fekeys:
         outstr += "[semType] "
         if fe.semType is None:
             outstr += "<None>\n"
         else:
-            outstr += "\n  " + "{0}({1})".format(fe.semType.name, fe.semType.ID) + "\n"
+            outstr += "\n  " + "{0}({1})".format(fe.semType.name, fe.semType.ID) + '\n'
 
     return outstr
 
@@ -718,26 +720,26 @@ def _pretty_frame(frame):
     outstr += "frame ({0.ID}): {0.name}\n\n".format(frame)
     outstr += "[URL] {0}\n\n".format(frame.URL)
     outstr += "[definition]\n"
-    outstr += _pretty_longstring(frame.definition, "  ") + "\n"
+    outstr += _pretty_longstring(frame.definition, '  ') + '\n'
 
     outstr += "[semTypes] {0} semantic types\n".format(len(frame.semTypes))
     outstr += (
         "  " * (len(frame.semTypes) > 0)
         + ", ".join("{0}({1})".format(x.name, x.ID) for x in frame.semTypes)
-        + "\n" * (len(frame.semTypes) > 0)
+        + '\n' * (len(frame.semTypes) > 0)
     )
 
     outstr += "\n[frameRelations] {0} frame relations\n".format(
         len(frame.frameRelations)
     )
-    outstr += "  " + "\n  ".join(repr(frel) for frel in frame.frameRelations) + "\n"
+    outstr += '  ' + '\n  '.join(repr(frel) for frel in frame.frameRelations) + '\n'
 
     outstr += "\n[lexUnit] {0} lexical units\n".format(len(frame.lexUnit))
     lustrs = []
     for luName, lu in sorted(frame.lexUnit.items()):
-        tmpstr = "{0} ({1})".format(luName, lu.ID)
+        tmpstr = '{0} ({1})'.format(luName, lu.ID)
         lustrs.append(tmpstr)
-    outstr += "{0}\n".format(_pretty_longstring(", ".join(lustrs), prefix="  "))
+    outstr += "{0}\n".format(_pretty_longstring(', '.join(lustrs), prefix='  '))
 
     outstr += "\n[FE] {0} frame elements\n".format(len(frame.FE))
     fes = {}
@@ -750,23 +752,23 @@ def _pretty_frame(frame):
     for ct in sorted(
         fes.keys(),
         key=lambda ct2: [
-            "Core",
-            "Core-Unexpressed",
-            "Peripheral",
-            "Extra-Thematic",
+            'Core',
+            'Core-Unexpressed',
+            'Peripheral',
+            'Extra-Thematic',
         ].index(ct2),
     ):
-        outstr += "{0:>16}: {1}\n".format(ct, ", ".join(sorted(fes[ct])))
+        outstr += "{0:>16}: {1}\n".format(ct, ', '.join(sorted(fes[ct])))
 
     outstr += "\n[FEcoreSets] {0} frame element core sets\n".format(
         len(frame.FEcoreSets)
     )
     outstr += (
         "  "
-        + "\n  ".join(
+        + '\n  '.join(
             ", ".join([x.name for x in coreSet]) for coreSet in frame.FEcoreSets
         )
-        + "\n"
+        + '\n'
     )
 
     return outstr
@@ -777,6 +779,7 @@ class FramenetError(Exception):
     """An exception class for framenet-related errors."""
 
 
+@python_2_unicode_compatible
 class AttrDict(dict):
 
     """A class that wraps a dict and allows accessing the keys of the
@@ -802,7 +805,7 @@ class AttrDict(dict):
         self[name] = value
 
     def __getattr__(self, name):
-        if name == "_short_repr":
+        if name == '_short_repr':
             return self._short_repr
         return self[name]
 
@@ -813,59 +816,59 @@ class AttrDict(dict):
         return v
 
     def _short_repr(self):
-        if "_type" in self:
-            if self["_type"].endswith("relation"):
+        if '_type' in self:
+            if self['_type'].endswith('relation'):
                 return self.__repr__()
             try:
                 return "<{0} ID={1} name={2}>".format(
-                    self["_type"], self["ID"], self["name"]
+                    self['_type'], self['ID'], self['name']
                 )
             except KeyError:
                 try:  # no ID--e.g., for _type=lusubcorpus
-                    return "<{0} name={1}>".format(self["_type"], self["name"])
+                    return "<{0} name={1}>".format(self['_type'], self['name'])
                 except KeyError:  # no name--e.g., for _type=lusentence
-                    return "<{0} ID={1}>".format(self["_type"], self["ID"])
+                    return "<{0} ID={1}>".format(self['_type'], self['ID'])
         else:
             return self.__repr__()
 
     def _str(self):
         outstr = ""
 
-        if "_type" not in self:
+        if '_type' not in self:
             outstr = _pretty_any(self)
-        elif self["_type"] == "frame":
+        elif self['_type'] == 'frame':
             outstr = _pretty_frame(self)
-        elif self["_type"] == "fe":
+        elif self['_type'] == 'fe':
             outstr = _pretty_fe(self)
-        elif self["_type"] == "lu":
+        elif self['_type'] == 'lu':
             outstr = _pretty_lu(self)
-        elif self["_type"] == "luexemplars":  # list of ALL exemplars for LU
+        elif self['_type'] == 'luexemplars':  # list of ALL exemplars for LU
             outstr = _pretty_exemplars(self, self[0].LU)
         elif (
-            self["_type"] == "fulltext_annotation"
+            self['_type'] == 'fulltext_annotation'
         ):  # list of all sentences for full-text doc
             outstr = _pretty_fulltext_sentences(self)
-        elif self["_type"] == "lusentence":
+        elif self['_type'] == 'lusentence':
             outstr = _pretty_annotation(self)
-        elif self["_type"] == "fulltext_sentence":
+        elif self['_type'] == 'fulltext_sentence':
             outstr = _pretty_fulltext_sentence(self)
-        elif self["_type"] in ("luannotationset", "fulltext_annotationset"):
+        elif self['_type'] in ('luannotationset', 'fulltext_annotationset'):
             outstr = _pretty_annotation(self, aset_level=True)
-        elif self["_type"] == "posannotationset":
+        elif self['_type'] == 'posannotationset':
             outstr = _pretty_pos(self)
-        elif self["_type"] == "semtype":
+        elif self['_type'] == 'semtype':
             outstr = _pretty_semtype(self)
-        elif self["_type"] == "framerelationtype":
+        elif self['_type'] == 'framerelationtype':
             outstr = _pretty_frame_relation_type(self)
-        elif self["_type"] == "framerelation":
+        elif self['_type'] == 'framerelation':
             outstr = _pretty_frame_relation(self)
-        elif self["_type"] == "ferelation":
+        elif self['_type'] == 'ferelation':
             outstr = _pretty_fe_relation(self)
         else:
             outstr = _pretty_any(self)
 
         # ensure result is unicode string prior to applying the
-        #  decorator (because non-ASCII characters
+        # @python_2_unicode_compatible decorator (because non-ASCII characters
         # could in principle occur in the data and would trigger an encoding error when
         # passed as arguments to str.format()).
         # assert isinstance(outstr, unicode) # not in Python 3.2
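
`_str()` above is what makes a plain `print()` of a FrameNet record produce the long human-readable report: it dispatches on the record's `_type` field to the matching `_pretty_*` helper. A rough usage sketch (the frame and FE names are illustrative examples; exact output depends on the installed FrameNet release):

    from nltk.corpus import framenet as fn

    f = fn.frame('Motion')
    print(f)                 # routed through _pretty_frame() via _str()
    print(f.FE['Theme'])     # routed through _pretty_fe()
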
@@ -878,6 +881,7 @@ class AttrDict(dict):
         return self.__str__()
 
 
+@python_2_unicode_compatible
 class SpecialList(list):
     """
     A list subclass which adds a '_type' attribute for special printing
@@ -894,7 +898,7 @@ class SpecialList(list):
         assert self._type
         if len(self) == 0:
             outstr = "[]"
-        elif self._type == "luexemplars":  # list of ALL exemplars for LU
+        elif self._type == 'luexemplars':  # list of ALL exemplars for LU
             outstr = _pretty_exemplars(self, self[0].LU)
         else:
             assert False, self._type
@@ -950,6 +954,7 @@ class Future(object):
         return self._data().__repr__()
 
 
+@python_2_unicode_compatible
 class PrettyDict(AttrDict):
     """
     Displays an abbreviated repr of values where possible.
@@ -958,22 +963,23 @@ class PrettyDict(AttrDict):
     """
 
     def __init__(self, *args, **kwargs):
-        _BREAK_LINES = kwargs.pop("breakLines", False)
+        _BREAK_LINES = kwargs.pop('breakLines', False)
         super(PrettyDict, self).__init__(*args, **kwargs)
-        dict.__setattr__(self, "_BREAK_LINES", _BREAK_LINES)
+        dict.__setattr__(self, '_BREAK_LINES', _BREAK_LINES)
 
     def __repr__(self):
         parts = []
         for k, v in sorted(self.items()):
-            kv = repr(k) + ": "
+            kv = repr(k) + ': '
             try:
                 kv += v._short_repr()
             except AttributeError:
                 kv += repr(v)
             parts.append(kv)
-        return "{" + (",\n " if self._BREAK_LINES else ", ").join(parts) + "}"
+        return '{' + (',\n ' if self._BREAK_LINES else ', ').join(parts) + '}'
 
 
+@python_2_unicode_compatible
 class PrettyList(list):
     """
     Displays an abbreviated repr of only the first several elements, not the whole list.
@@ -981,8 +987,8 @@ class PrettyList(list):
 
     # from nltk.util
     def __init__(self, *args, **kwargs):
-        self._MAX_REPR_SIZE = kwargs.pop("maxReprSize", 60)
-        self._BREAK_LINES = kwargs.pop("breakLines", False)
+        self._MAX_REPR_SIZE = kwargs.pop('maxReprSize', 60)
+        self._BREAK_LINES = kwargs.pop('breakLines', False)
         super(PrettyList, self).__init__(*args, **kwargs)
 
     def __repr__(self):
@@ -1000,12 +1006,13 @@ class PrettyList(list):
             )  # key difference from inherited version: call to _short_repr()
             length += len(pieces[-1]) + 2
             if self._MAX_REPR_SIZE and length > self._MAX_REPR_SIZE and len(pieces) > 2:
-                return "[%s, ...]" % str(
-                    ",\n " if self._BREAK_LINES else ", "
+                return "[%s, ...]" % text_type(
+                    ',\n ' if self._BREAK_LINES else ', '
                 ).join(pieces[:-1])
-        return "[%s]" % str(",\n " if self._BREAK_LINES else ", ").join(pieces)
+        return "[%s]" % text_type(',\n ' if self._BREAK_LINES else ', ').join(pieces)
 
 
+@python_2_unicode_compatible
 class PrettyLazyMap(LazyMap):
     """
     Displays an abbreviated repr of only the first several elements, not the whole list.
@@ -1028,10 +1035,11 @@ class PrettyLazyMap(LazyMap):
             )  # key difference from inherited version: call to _short_repr()
             length += len(pieces[-1]) + 2
             if length > self._MAX_REPR_SIZE and len(pieces) > 2:
-                return "[%s, ...]" % str(", ").join(pieces[:-1])
-        return "[%s]" % str(", ").join(pieces)
+                return "[%s, ...]" % text_type(', ').join(pieces[:-1])
+        return "[%s]" % text_type(', ').join(pieces)
 
 
+@python_2_unicode_compatible
 class PrettyLazyIteratorList(LazyIteratorList):
     """
     Displays an abbreviated repr of only the first several elements, not the whole list.
@@ -1054,10 +1062,11 @@ class PrettyLazyIteratorList(LazyIteratorList):
             )  # key difference from inherited version: call to _short_repr()
             length += len(pieces[-1]) + 2
             if length > self._MAX_REPR_SIZE and len(pieces) > 2:
-                return "[%s, ...]" % str(", ").join(pieces[:-1])
-        return "[%s]" % str(", ").join(pieces)
+                return "[%s, ...]" % text_type(', ').join(pieces[:-1])
+        return "[%s]" % text_type(', ').join(pieces)
 
 
+@python_2_unicode_compatible
 class PrettyLazyConcatenation(LazyConcatenation):
     """
     Displays an abbreviated repr of only the first several elements, not the whole list.
@@ -1080,8 +1089,8 @@ class PrettyLazyConcatenation(LazyConcatenation):
             )  # key difference from inherited version: call to _short_repr()
             length += len(pieces[-1]) + 2
             if length > self._MAX_REPR_SIZE and len(pieces) > 2:
-                return "[%s, ...]" % str(", ").join(pieces[:-1])
-        return "[%s]" % str(", ").join(pieces)
+                return "[%s, ...]" % text_type(', ').join(pieces[:-1])
+        return "[%s]" % text_type(', ').join(pieces)
 
     def __add__(self, other):
         """Return a list concatenating self with other."""
@@ -1104,7 +1113,7 @@ class FramenetCorpusReader(XMLCorpusReader):
     True
     """
 
-    _bad_statuses = ["Problem"]
+    _bad_statuses = ['Problem']
     """
     When loading LUs for a frame, those whose status is in this list will be ignored.
     Due to caching, if user code modifies this, it should do so before loading any data.
@@ -1240,9 +1249,9 @@ warnings(True) to display corpus consistency warnings when loading data
             # otherwise weird ordering effects might result in incomplete information
         self._frame_idx = {}
         for f in XMLCorpusView(
-            self.abspath("frameIndex.xml"), "frameIndex/frame", self._handle_elt
+            self.abspath("frameIndex.xml"), 'frameIndex/frame', self._handle_elt
         ):
-            self._frame_idx[f["ID"]] = f
+            self._frame_idx[f['ID']] = f
 
     def _buildcorpusindex(self):
         # The total number of fulltext annotated documents in Framenet
@@ -1250,7 +1259,7 @@ warnings(True) to display corpus consistency warnings when loading data
         self._fulltext_idx = {}
         for doclist in XMLCorpusView(
             self.abspath("fulltextIndex.xml"),
-            "fulltextIndex/corpus",
+            'fulltextIndex/corpus',
             self._handle_fulltextindex_elt,
         ):
             for doc in doclist:
@@ -1261,10 +1270,10 @@ warnings(True) to display corpus consistency warnings when loading data
         # should not be very large
         self._lu_idx = {}
         for lu in XMLCorpusView(
-            self.abspath("luIndex.xml"), "luIndex/lu", self._handle_elt
+            self.abspath("luIndex.xml"), 'luIndex/lu', self._handle_elt
         ):
             self._lu_idx[
-                lu["ID"]
+                lu['ID']
             ] = lu  # populate with LU index entries. if any of these
             # are looked up they will be replaced by full LU objects.
 
@@ -1274,7 +1283,7 @@ warnings(True) to display corpus consistency warnings when loading data
             x
             for x in XMLCorpusView(
                 self.abspath("frRelation.xml"),
-                "frameRelations/frameRelationType",
+                'frameRelations/frameRelationType',
                 self._handle_framerelationtype_elt,
             )
         )
@@ -1309,7 +1318,7 @@ warnings(True) to display corpus consistency warnings when loading data
 
     def _warn(self, *message, **kwargs):
         if self._warnings:
-            kwargs.setdefault("file", sys.stderr)
+            kwargs.setdefault('file', sys.stderr)
             print(*message, **kwargs)
 
     def readme(self):
@@ -1394,7 +1403,7 @@ warnings(True) to display corpus consistency warnings when loading data
         locpath = os.path.join("{0}".format(self._root), self._fulltext_dir, xmlfname)
 
         # Grab the top-level xml element containing the fulltext annotation
-        elt = XMLCorpusView(locpath, "fullTextAnnotation")[0]
+        elt = XMLCorpusView(locpath, 'fullTextAnnotation')[0]
         info = self._handle_fulltextannotation_elt(elt)
         # add metadata
         for k, v in self._fulltext_idx[fn_docid].items():
@@ -1432,14 +1441,14 @@ warnings(True) to display corpus consistency warnings when loading data
         # get the name of the frame with this id number
         try:
             fentry = self._frame_idx[fn_fid]
-            if "_type" in fentry:
+            if '_type' in fentry:
                 return fentry  # full frame object is cached
-            name = fentry["name"]
+            name = fentry['name']
         except TypeError:
             self._buildframeindex()
-            name = self._frame_idx[fn_fid]["name"]
+            name = self._frame_idx[fn_fid]['name']
         except KeyError:
-            raise FramenetError("Unknown frame id: {0}".format(fn_fid))
+            raise FramenetError('Unknown frame id: {0}'.format(fn_fid))
 
         return self.frame_by_name(name, ignorekeys, check_cache=False)
 
@@ -1482,18 +1491,18 @@ warnings(True) to display corpus consistency warnings when loading data
         # print(locpath, file=sys.stderr)
         # Grab the xml for the frame
         try:
-            elt = XMLCorpusView(locpath, "frame")[0]
+            elt = XMLCorpusView(locpath, 'frame')[0]
         except IOError:
-            raise FramenetError("Unknown frame: {0}".format(fn_fname))
+            raise FramenetError('Unknown frame: {0}'.format(fn_fname))
 
         fentry = self._handle_frame_elt(elt, ignorekeys)
         assert fentry
 
-        fentry.URL = self._fnweb_url + "/" + self._frame_dir + "/" + fn_fname + ".xml"
+        fentry.URL = self._fnweb_url + '/' + self._frame_dir + '/' + fn_fname + '.xml'
 
         # INFERENCE RULE: propagate lexical semtypes from the frame to all its LUs
         for st in fentry.semTypes:
-            if st.rootType.name == "Lexical_type":
+            if st.rootType.name == 'Lexical_type':
                 for lu in fentry.lexUnit.values():
                     if not any(
                         x is st for x in lu.semTypes
@@ -1502,12 +1511,12 @@ warnings(True) to display corpus consistency warnings when loading data
 
         self._frame_idx[fentry.ID] = fentry
         self._cached_frames[fentry.name] = fentry.ID
-        """
+        '''
         # now set up callables to resolve the LU pointers lazily.
         # (could also do this here--caching avoids infinite recursion.)
         for luName,luinfo in fentry.lexUnit.items():
             fentry.lexUnit[luName] = (lambda luID: Future(lambda: self.lu(luID)))(luinfo.ID)
-        """
+        '''
         return fentry
 
     def frame(self, fn_fid_or_fname, ignorekeys=[]):
@@ -1579,7 +1588,7 @@ warnings(True) to display corpus consistency warnings when loading data
         """
 
         # get the frame info by name or id number
-        if isinstance(fn_fid_or_fname, str):
+        if isinstance(fn_fid_or_fname, string_types):
             f = self.frame_by_name(fn_fid_or_fname, ignorekeys)
         else:
             f = self.frame_by_id(fn_fid_or_fname, ignorekeys)
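
The `string_types` check above (and the `text_type(...)` joins in the repr methods earlier) come from the Python 2/3 compatibility layer used by the 3.4-era code base, presumably six. A sketch of what those aliases resolve to (illustrative, not the exact six source):

    import sys

    if sys.version_info[0] >= 3:
        string_types = (str,)          # isinstance() target for "any string"
        text_type = str                # the unicode text type
    else:
        string_types = (basestring,)   # noqa: F821 -- Python 2 only
        text_type = unicode            # noqa: F821 -- Python 2 only

    # frame() above dispatches on exactly this distinction:
    isinstance('Motion', string_types)   # True  -> look up by frame name
    isinstance(1234, string_types)       # False -> look up by frame ID
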
@@ -1625,7 +1634,7 @@ warnings(True) to display corpus consistency warnings when loading data
         >>> lu # doctest: +ELLIPSIS
         {'ID': 256,
          'POS': 'V',
-         'URL': 'https://framenet2.icsi.berkeley.edu/fnReports/data/lu/lu256.xml',
+         'URL': u'https://framenet2.icsi.berkeley.edu/fnReports/data/lu/lu256.xml',
          '_type': 'lu',
          'cBy': ...,
          'cDate': '02/08/2001 01:27:50 PST Thu',
@@ -1644,7 +1653,7 @@ warnings(True) to display corpus consistency warnings when loading data
         :return: Basic information about the lexical unit
         :rtype: dict
         """
-        return self.lu(fn_luid, ignorekeys=["subCorpus", "exemplars"])
+        return self.lu(fn_luid, ignorekeys=['subCorpus', 'exemplars'])
 
     def lu(self, fn_luid, ignorekeys=[], luName=None, frameID=None, frameName=None):
         """
@@ -1784,24 +1793,24 @@ warnings(True) to display corpus consistency warnings when loading data
             # luName, frameID, and frameName. However, this will not be listed
             # among the LUs for its frame.
             self._warn(
-                "LU ID not found: {0} ({1}) in {2} ({3})".format(
+                'LU ID not found: {0} ({1}) in {2} ({3})'.format(
                     luName, fn_luid, frameName, frameID
                 )
             )
             luinfo = AttrDict(
                 {
-                    "_type": "lu",
-                    "ID": fn_luid,
-                    "name": luName,
-                    "frameID": frameID,
-                    "status": "Problem",
+                    '_type': 'lu',
+                    'ID': fn_luid,
+                    'name': luName,
+                    'frameID': frameID,
+                    'status': 'Problem',
                 }
             )
             f = self.frame_by_id(luinfo.frameID)
             assert f.name == frameName, (f.name, frameName)
-            luinfo["frame"] = f
+            luinfo['frame'] = f
             self._lu_idx[fn_luid] = luinfo
-        elif "_type" not in luinfo:
+        elif '_type' not in luinfo:
             # we only have an index entry for the LU. loading the frame will replace this.
             f = self.frame_by_id(luinfo.frameID)
             luinfo = self._lu_idx[fn_luid]
@@ -1826,22 +1835,22 @@ warnings(True) to display corpus consistency warnings when loading data
             self._buildluindex()
 
         try:
-            elt = XMLCorpusView(locpath, "lexUnit")[0]
+            elt = XMLCorpusView(locpath, 'lexUnit')[0]
         except IOError:
-            raise FramenetError("Unknown LU id: {0}".format(fn_luid))
+            raise FramenetError('Unknown LU id: {0}'.format(fn_luid))
 
         lu2 = self._handle_lexunit_elt(elt, ignorekeys)
-        lu.URL = self._fnweb_url + "/" + self._lu_dir + "/" + fname
+        lu.URL = self._fnweb_url + '/' + self._lu_dir + '/' + fname
         lu.subCorpus = lu2.subCorpus
         lu.exemplars = SpecialList(
-            "luexemplars", [sent for subc in lu.subCorpus for sent in subc.sentence]
+            'luexemplars', [sent for subc in lu.subCorpus for sent in subc.sentence]
         )
         for sent in lu.exemplars:
-            sent["LU"] = lu
-            sent["frame"] = lu.frame
+            sent['LU'] = lu
+            sent['frame'] = lu.frame
             for aset in sent.annotationSet:
-                aset["LU"] = lu
-                aset["frame"] = lu.frame
+                aset['LU'] = lu
+                aset['frame'] = lu.frame
 
         return lu
 
@@ -1852,14 +1861,14 @@ warnings(True) to display corpus consistency warnings when loading data
             x
             for x in XMLCorpusView(
                 self.abspath("semTypes.xml"),
-                "semTypes/semType",
+                'semTypes/semType',
                 self._handle_semtype_elt,
             )
         ]
         for st in semtypeXML:
-            n = st["name"]
-            a = st["abbrev"]
-            i = st["ID"]
+            n = st['name']
+            a = st['abbrev']
+            i = st['ID']
             # Both name and abbrev should be able to retrieve the
             # ID. The ID will retrieve the semantic type dict itself.
             self._semtypes[n] = i
@@ -1926,7 +1935,7 @@ warnings(True) to display corpus consistency warnings when loading data
                             changed = True
                             nPropagations += 1
                     if (
-                        ferel.type.name in ["Perspective_on", "Subframe", "Precedes"]
+                        ferel.type.name in ['Perspective_on', 'Subframe', 'Precedes']
                         and subST
                         and subST is not superST
                     ):
@@ -2073,7 +2082,7 @@ warnings(True) to display corpus consistency warnings when loading data
         )
 
     def fes(self, name=None, frame=None):
-        """
+        '''
         Lists frame element objects. If 'name' is provided, this is treated as
         a case-insensitive regular expression to filter by frame name.
         (Case-insensitivity is because casing of frame element names is not always
@@ -2105,12 +2114,12 @@ warnings(True) to display corpus consistency warnings when loading data
         :type name: str
         :return: A list of matching frame elements
         :rtype: list(AttrDict)
-        """
+        '''
         # what frames are we searching in?
         if frame is not None:
             if isinstance(frame, int):
                 frames = [self.frame(frame)]
-            elif isinstance(frame, str):
+            elif isinstance(frame, string_types):
                 frames = self.frames(frame)
             else:
                 frames = [frame]
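
A short usage sketch for `fes()` as defined above: `name` is a case-insensitive regex over frame element names, and `frame` may be an ID, a name pattern, or a frame object (results depend on the installed FrameNet release):

    from nltk.corpus import framenet as fn

    # All FEs whose name starts with 'sound', across every frame:
    for fe in fn.fes('^sound'):
        print(fe.frame.name, fe.name, fe.coreType)

    # Restrict to the FEs of a single frame:
    print([fe.name for fe in fn.fes(frame='Motion')])
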
@@ -2239,7 +2248,7 @@ warnings(True) to display corpus consistency warnings when loading data
             if frame is not None:
                 if isinstance(frame, int):
                     frameIDs = {frame}
-                elif isinstance(frame, str):
+                elif isinstance(frame, string_types):
                     frameIDs = {f.ID for f in self.frames(frame)}
                 else:
                     frameIDs = {frame.ID}
@@ -2247,7 +2256,7 @@ warnings(True) to display corpus consistency warnings when loading data
         elif frame is not None:  # all LUs in matching frames
             if isinstance(frame, int):
                 frames = [self.frame(frame)]
-            elif isinstance(frame, str):
+            elif isinstance(frame, string_types):
                 frames = self.frames(frame)
             else:
                 frames = [frame]
@@ -2321,7 +2330,7 @@ warnings(True) to display corpus consistency warnings when loading data
             return ftlist
         else:
             return PrettyList(
-                x for x in ftlist if re.search(name, x["filename"]) is not None
+                x for x in ftlist if re.search(name, x['filename']) is not None
             )
 
     def docs(self, name=None):
@@ -2362,7 +2371,7 @@ warnings(True) to display corpus consistency warnings when loading data
                 aset
                 for sent in self.ft_sents()
                 for aset in sent.annotationSet[1:]
-                if luNamePattern is None or aset.get("luID", "CXN_ASET") in matchedLUIDs
+                if luNamePattern is None or aset.get('luID', 'CXN_ASET') in matchedLUIDs
             )
         else:
             ftpart = []
@@ -2383,17 +2392,17 @@ warnings(True) to display corpus consistency warnings when loading data
         be specified to retrieve sentences with both overt FEs (in either order).
         """
         if fe is None and fe2 is not None:
-            raise FramenetError("exemplars(..., fe=None, fe2=<value>) is not allowed")
+            raise FramenetError('exemplars(..., fe=None, fe2=<value>) is not allowed')
         elif fe is not None and fe2 is not None:
-            if not isinstance(fe2, str):
-                if isinstance(fe, str):
+            if not isinstance(fe2, string_types):
+                if isinstance(fe, string_types):
                     # fe2 is specific to a particular frame. swap fe and fe2 so fe is always used to determine the frame.
                     fe, fe2 = fe2, fe
                 elif fe.frame is not fe2.frame:  # ensure frames match
                     raise FramenetError(
-                        "exemplars() call with inconsistent `fe` and `fe2` specification (frames must match)"
+                        'exemplars() call with inconsistent `fe` and `fe2` specification (frames must match)'
                     )
-        if frame is None and fe is not None and not isinstance(fe, str):
+        if frame is None and fe is not None and not isinstance(fe, string_types):
             frame = fe.frame
 
         # narrow down to frames matching criteria
@@ -2402,7 +2411,7 @@ warnings(True) to display corpus consistency warnings when loading data
             list
         )  # frame name -> matching LUs, if luNamePattern is specified
         if frame is not None or luNamePattern is not None:
-            if frame is None or isinstance(frame, str):
+            if frame is None or isinstance(frame, string_types):
                 if luNamePattern is not None:
                     frames = set()
                     for lu in self.lus(luNamePattern, frame=frame):
@@ -2421,7 +2430,7 @@ warnings(True) to display corpus consistency warnings when loading data
                     lusByFrame = {frame.name: self.lus(luNamePattern, frame=frame)}
 
             if fe is not None:  # narrow to frames that define this FE
-                if isinstance(fe, str):
+                if isinstance(fe, string_types):
                     frames = PrettyLazyIteratorList(
                         f
                         for f in frames
@@ -2431,12 +2440,12 @@ warnings(True) to display corpus consistency warnings when loading data
                 else:
                     if fe.frame not in frames:
                         raise FramenetError(
-                            "exemplars() call with inconsistent `frame` and `fe` specification"
+                            'exemplars() call with inconsistent `frame` and `fe` specification'
                         )
                     frames = [fe.frame]
 
                 if fe2 is not None:  # narrow to frames that ALSO define this FE
-                    if isinstance(fe2, str):
+                    if isinstance(fe2, string_types):
                         frames = PrettyLazyIteratorList(
                             f
                             for f in frames
@@ -2463,13 +2472,13 @@ warnings(True) to display corpus consistency warnings when loading data
                 if fe is not None:
                     fes = (
                         {ffe for ffe in f.FE.keys() if re.search(fe, ffe, re.I)}
-                        if isinstance(fe, str)
+                        if isinstance(fe, string_types)
                         else {fe.name}
                     )
                     if fe2 is not None:
                         fes2 = (
                             {ffe for ffe in f.FE.keys() if re.search(fe2, ffe, re.I)}
-                            if isinstance(fe2, str)
+                            if isinstance(fe2, string_types)
                             else {fe2.name}
                         )
 
@@ -2494,9 +2503,9 @@ warnings(True) to display corpus consistency warnings when loading data
         If 'fes' is None, returns all overt FE names.
         """
         overtNames = set(list(zip(*ex.FE[0]))[2]) if ex.FE[0] else set()
-        if "FE2" in ex:
+        if 'FE2' in ex:
             overtNames |= set(list(zip(*ex.FE2[0]))[2]) if ex.FE2[0] else set()
-            if "FE3" in ex:
+            if 'FE3' in ex:
                 overtNames |= set(list(zip(*ex.FE3[0]))[2]) if ex.FE3[0] else set()
         return overtNames & fes if fes is not None else overtNames
 
@@ -2581,7 +2590,7 @@ warnings(True) to display corpus consistency warnings when loading data
 
         # lookup by 'frame'
         if frame is not None:
-            if isinstance(frame, dict) and "frameRelations" in frame:
+            if isinstance(frame, dict) and 'frameRelations' in frame:
                 rels = PrettyList(frame.frameRelations)
             else:
                 if not isinstance(frame, int):
@@ -2715,11 +2724,11 @@ warnings(True) to display corpus consistency warnings when loading data
 
         # Ignore these attributes when loading attributes from an xml node
         ignore_attrs = [  #'cBy', 'cDate', 'mDate', # <-- annotation metadata that could be of interest
-            "xsi",
-            "schemaLocation",
-            "xmlns",
-            "bgColor",
-            "fgColor",
+            'xsi',
+            'schemaLocation',
+            'xmlns',
+            'bgColor',
+            'fgColor',
         ]
 
         for attr in attr_dict:
@@ -2744,35 +2753,35 @@ warnings(True) to display corpus consistency warnings when loading data
         """
 
         try:
-            """
+            '''
             # Look for boundary issues in markup. (Sometimes FEs are pluralized in definitions.)
             m = re.search(r'\w[<][^/]|[<][/][^>]+[>](s\w|[a-rt-z0-9])', data)
             if m:
                 print('Markup boundary:', data[max(0,m.start(0)-10):m.end(0)+10].replace('\n',' '), file=sys.stderr)
-            """
-
-            data = data.replace("<t>", "")
-            data = data.replace("</t>", "")
-            data = re.sub('<fex name="[^"]+">', "", data)
-            data = data.replace("</fex>", "")
-            data = data.replace("<fen>", "")
-            data = data.replace("</fen>", "")
-            data = data.replace("<m>", "")
-            data = data.replace("</m>", "")
-            data = data.replace("<ment>", "")
-            data = data.replace("</ment>", "")
-            data = data.replace("<ex>", "'")
-            data = data.replace("</ex>", "'")
-            data = data.replace("<gov>", "")
-            data = data.replace("</gov>", "")
-            data = data.replace("<x>", "")
-            data = data.replace("</x>", "")
+            '''
+
+            data = data.replace('<t>', '')
+            data = data.replace('</t>', '')
+            data = re.sub('<fex name="[^"]+">', '', data)
+            data = data.replace('</fex>', '')
+            data = data.replace('<fen>', '')
+            data = data.replace('</fen>', '')
+            data = data.replace('<m>', '')
+            data = data.replace('</m>', '')
+            data = data.replace('<ment>', '')
+            data = data.replace('</ment>', '')
+            data = data.replace('<ex>', "'")
+            data = data.replace('</ex>', "'")
+            data = data.replace('<gov>', '')
+            data = data.replace('</gov>', '')
+            data = data.replace('<x>', '')
+            data = data.replace('</x>', '')
 
             # Get rid of <def-root> and </def-root> tags
-            data = data.replace("<def-root>", "")
-            data = data.replace("</def-root>", "")
+            data = data.replace('<def-root>', '')
+            data = data.replace('</def-root>', '')
 
-            data = data.replace("\n", " ")
+            data = data.replace('\n', ' ')
         except AttributeError:
             pass
 
@@ -2799,15 +2808,15 @@ warnings(True) to display corpus consistency warnings when loading data
         corpid = ftinfo.ID
         retlist = []
         for sub in elt:
-            if sub.tag.endswith("document"):
+            if sub.tag.endswith('document'):
                 doc = self._load_xml_attributes(AttrDict(), sub)
-                if "name" in doc:
+                if 'name' in doc:
                     docname = doc.name
                 else:
                     docname = doc.description
                 doc.filename = "{0}__{1}.xml".format(corpname, docname)
                 doc.URL = (
-                    self._fnweb_url + "/" + self._fulltext_dir + "/" + doc.filename
+                    self._fnweb_url + '/' + self._fulltext_dir + '/' + doc.filename
                 )
                 doc.corpname = corpname
                 doc.corpid = corpid
@@ -2819,59 +2828,59 @@ warnings(True) to display corpus consistency warnings when loading data
         """Load the info for a Frame from a frame xml file"""
         frinfo = self._load_xml_attributes(AttrDict(), elt)
 
-        frinfo["_type"] = "frame"
-        frinfo["definition"] = ""
-        frinfo["definitionMarkup"] = ""
-        frinfo["FE"] = PrettyDict()
-        frinfo["FEcoreSets"] = []
-        frinfo["lexUnit"] = PrettyDict()
-        frinfo["semTypes"] = []
+        frinfo['_type'] = 'frame'
+        frinfo['definition'] = ""
+        frinfo['definitionMarkup'] = ""
+        frinfo['FE'] = PrettyDict()
+        frinfo['FEcoreSets'] = []
+        frinfo['lexUnit'] = PrettyDict()
+        frinfo['semTypes'] = []
         for k in ignorekeys:
             if k in frinfo:
                 del frinfo[k]
 
         for sub in elt:
-            if sub.tag.endswith("definition") and "definition" not in ignorekeys:
-                frinfo["definitionMarkup"] = sub.text
-                frinfo["definition"] = self._strip_tags(sub.text)
-            elif sub.tag.endswith("FE") and "FE" not in ignorekeys:
+            if sub.tag.endswith('definition') and 'definition' not in ignorekeys:
+                frinfo['definitionMarkup'] = sub.text
+                frinfo['definition'] = self._strip_tags(sub.text)
+            elif sub.tag.endswith('FE') and 'FE' not in ignorekeys:
                 feinfo = self._handle_fe_elt(sub)
-                frinfo["FE"][feinfo.name] = feinfo
-                feinfo["frame"] = frinfo  # backpointer
-            elif sub.tag.endswith("FEcoreSet") and "FEcoreSet" not in ignorekeys:
+                frinfo['FE'][feinfo.name] = feinfo
+                feinfo['frame'] = frinfo  # backpointer
+            elif sub.tag.endswith('FEcoreSet') and 'FEcoreSet' not in ignorekeys:
                 coreset = self._handle_fecoreset_elt(sub)
                 # assumes all FEs have been loaded before coresets
-                frinfo["FEcoreSets"].append(
-                    PrettyList(frinfo["FE"][fe.name] for fe in coreset)
+                frinfo['FEcoreSets'].append(
+                    PrettyList(frinfo['FE'][fe.name] for fe in coreset)
                 )
-            elif sub.tag.endswith("lexUnit") and "lexUnit" not in ignorekeys:
+            elif sub.tag.endswith('lexUnit') and 'lexUnit' not in ignorekeys:
                 luentry = self._handle_framelexunit_elt(sub)
-                if luentry["status"] in self._bad_statuses:
+                if luentry['status'] in self._bad_statuses:
                     # problematic LU entry; ignore it
                     continue
-                luentry["frame"] = frinfo
-                luentry["URL"] = (
+                luentry['frame'] = frinfo
+                luentry['URL'] = (
                     self._fnweb_url
-                    + "/"
+                    + '/'
                     + self._lu_dir
-                    + "/"
-                    + "lu{0}.xml".format(luentry["ID"])
+                    + '/'
+                    + "lu{0}.xml".format(luentry['ID'])
                 )
-                luentry["subCorpus"] = Future(
+                luentry['subCorpus'] = Future(
                     (lambda lu: lambda: self._lu_file(lu).subCorpus)(luentry)
                 )
-                luentry["exemplars"] = Future(
+                luentry['exemplars'] = Future(
                     (lambda lu: lambda: self._lu_file(lu).exemplars)(luentry)
                 )
-                frinfo["lexUnit"][luentry.name] = luentry
+                frinfo['lexUnit'][luentry.name] = luentry
                 if not self._lu_idx:
                     self._buildluindex()
                 self._lu_idx[luentry.ID] = luentry
-            elif sub.tag.endswith("semType") and "semTypes" not in ignorekeys:
+            elif sub.tag.endswith('semType') and 'semTypes' not in ignorekeys:
                 semtypeinfo = self._load_xml_attributes(AttrDict(), sub)
-                frinfo["semTypes"].append(self.semtype(semtypeinfo.ID))
+                frinfo['semTypes'].append(self.semtype(semtypeinfo.ID))
 
-        frinfo["frameRelations"] = self.frame_relations(frame=frinfo)
+        frinfo['frameRelations'] = self.frame_relations(frame=frinfo)
 
         # resolve 'requires' and 'excludes' links between FEs of this frame
         for fe in frinfo.FE.values():
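
The `Future((lambda lu: lambda: self._lu_file(lu).subCorpus)(luentry))` construction above applies the outer lambda immediately to freeze the current `luentry`; a bare `lambda: self._lu_file(luentry)` would late-bind and every LU created in the loop would resolve to the last entry. A minimal illustration of the pitfall and the fix:

    # Late binding: every closure sees the loop variable's final value.
    broken = [lambda: name for name in ('run.v', 'walk.v')]
    print([f() for f in broken])    # ['walk.v', 'walk.v']

    # The reader's fix: an immediately-applied outer lambda captures the
    # current value in its own scope.
    fixed = [(lambda n: lambda: n)(name) for name in ('run.v', 'walk.v')]
    print([f() for f in fixed])     # ['run.v', 'walk.v']
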
@@ -2898,32 +2907,32 @@ warnings(True) to display corpus consistency warnings when loading data
     def _handle_framerelationtype_elt(self, elt, *args):
         """Load frame-relation element and its child fe-relation elements from frRelation.xml."""
         info = self._load_xml_attributes(AttrDict(), elt)
-        info["_type"] = "framerelationtype"
-        info["frameRelations"] = PrettyList()
+        info['_type'] = 'framerelationtype'
+        info['frameRelations'] = PrettyList()
 
         for sub in elt:
-            if sub.tag.endswith("frameRelation"):
+            if sub.tag.endswith('frameRelation'):
                 frel = self._handle_framerelation_elt(sub)
-                frel["type"] = info  # backpointer
+                frel['type'] = info  # backpointer
                 for ferel in frel.feRelations:
-                    ferel["type"] = info
-                info["frameRelations"].append(frel)
+                    ferel['type'] = info
+                info['frameRelations'].append(frel)
 
         return info
 
     def _handle_framerelation_elt(self, elt):
         """Load frame-relation element and its child fe-relation elements from frRelation.xml."""
         info = self._load_xml_attributes(AttrDict(), elt)
-        assert info["superFrameName"] != info["subFrameName"], (elt, info)
-        info["_type"] = "framerelation"
-        info["feRelations"] = PrettyList()
+        assert info['superFrameName'] != info['subFrameName'], (elt, info)
+        info['_type'] = 'framerelation'
+        info['feRelations'] = PrettyList()
 
         for sub in elt:
-            if sub.tag.endswith("FERelation"):
+            if sub.tag.endswith('FERelation'):
                 ferel = self._handle_elt(sub)
-                ferel["_type"] = "ferelation"
-                ferel["frameRelation"] = info  # backpointer
-                info["feRelations"].append(ferel)
+                ferel['_type'] = 'ferelation'
+                ferel['frameRelation'] = info  # backpointer
+                info['feRelations'].append(ferel)
 
         return info
 
@@ -2933,16 +2942,16 @@ warnings(True) to display corpus consistency warnings when loading data
         element (which we ignore here) and a bunch of 'sentence'
         elements."""
         info = AttrDict()
-        info["_type"] = "fulltext_annotation"
-        info["sentence"] = []
+        info['_type'] = 'fulltext_annotation'
+        info['sentence'] = []
 
         for sub in elt:
-            if sub.tag.endswith("header"):
+            if sub.tag.endswith('header'):
                 continue  # not used
-            elif sub.tag.endswith("sentence"):
+            elif sub.tag.endswith('sentence'):
                 s = self._handle_fulltext_sentence_elt(sub)
                 s.doc = info
-                info["sentence"].append(s)
+                info['sentence'].append(s)
 
         return info
 
@@ -2951,28 +2960,28 @@ warnings(True) to display corpus consistency warnings when loading data
         'sentence' element contains a "text" and "annotationSet" sub
         elements."""
         info = self._load_xml_attributes(AttrDict(), elt)
-        info["_type"] = "fulltext_sentence"
-        info["annotationSet"] = []
-        info["targets"] = []
+        info['_type'] = "fulltext_sentence"
+        info['annotationSet'] = []
+        info['targets'] = []
         target_spans = set()
-        info["_ascii"] = types.MethodType(
+        info['_ascii'] = types.MethodType(
             _annotation_ascii, info
         )  # attach a method for this instance
-        info["text"] = ""
+        info['text'] = ""
 
         for sub in elt:
-            if sub.tag.endswith("text"):
-                info["text"] = self._strip_tags(sub.text)
-            elif sub.tag.endswith("annotationSet"):
+            if sub.tag.endswith('text'):
+                info['text'] = self._strip_tags(sub.text)
+            elif sub.tag.endswith('annotationSet'):
                 a = self._handle_fulltextannotationset_elt(
-                    sub, is_pos=(len(info["annotationSet"]) == 0)
+                    sub, is_pos=(len(info['annotationSet']) == 0)
                 )
-                if "cxnID" in a:  # ignoring construction annotations for now
+                if 'cxnID' in a:  # ignoring construction annotations for now
                     continue
                 a.sent = info
                 a.text = info.text
-                info["annotationSet"].append(a)
-                if "Target" in a:
+                info['annotationSet'].append(a)
+                if 'Target' in a:
                     for tspan in a.Target:
                         if tspan in target_spans:
                             self._warn(
@@ -2980,19 +2989,19 @@ warnings(True) to display corpus consistency warnings when loading data
                                     info.text[slice(*tspan)]
                                 ),
                                 tspan,
-                                "in sentence",
-                                info["ID"],
+                                'in sentence',
+                                info['ID'],
                                 info.text,
                             )
                             # this can happen in cases like "chemical and biological weapons"
                             # being annotated as "chemical weapons" and "biological weapons"
                         else:
                             target_spans.add(tspan)
-                    info["targets"].append((a.Target, a.luName, a.frameName))
+                    info['targets'].append((a.Target, a.luName, a.frameName))
 
-        assert info["annotationSet"][0].status == "UNANN"
-        info["POS"] = info["annotationSet"][0].POS
-        info["POS_tagset"] = info["annotationSet"][0].POS_tagset
+        assert info['annotationSet'][0].status == 'UNANN'
+        info['POS'] = info['annotationSet'][0].POS
+        info['POS_tagset'] = info['annotationSet'][0].POS_tagset
         return info
 
     def _handle_fulltextannotationset_elt(self, elt, is_pos=False):
@@ -3001,62 +3010,62 @@ warnings(True) to display corpus consistency warnings when loading data
 
         info = self._handle_luannotationset_elt(elt, is_pos=is_pos)
         if not is_pos:
-            info["_type"] = "fulltext_annotationset"
-            if "cxnID" not in info:  # ignoring construction annotations for now
-                info["LU"] = self.lu(
+            info['_type'] = 'fulltext_annotationset'
+            if 'cxnID' not in info:  # ignoring construction annotations for now
+                info['LU'] = self.lu(
                     info.luID,
                     luName=info.luName,
                     frameID=info.frameID,
                     frameName=info.frameName,
                 )
-                info["frame"] = info.LU.frame
+                info['frame'] = info.LU.frame
         return info
 
     def _handle_fulltextlayer_elt(self, elt):
         """Load information from the given 'layer' element. Each
         'layer' contains several "label" elements."""
         info = self._load_xml_attributes(AttrDict(), elt)
-        info["_type"] = "layer"
-        info["label"] = []
+        info['_type'] = 'layer'
+        info['label'] = []
 
         for sub in elt:
-            if sub.tag.endswith("label"):
+            if sub.tag.endswith('label'):
                 l = self._load_xml_attributes(AttrDict(), sub)
-                info["label"].append(l)
+                info['label'].append(l)
 
         return info
 
     def _handle_framelexunit_elt(self, elt):
         """Load the lexical unit info from an xml element in a frame's xml file."""
         luinfo = AttrDict()
-        luinfo["_type"] = "lu"
+        luinfo['_type'] = 'lu'
         luinfo = self._load_xml_attributes(luinfo, elt)
         luinfo["definition"] = ""
         luinfo["definitionMarkup"] = ""
         luinfo["sentenceCount"] = PrettyDict()
-        luinfo["lexemes"] = PrettyList()  # multiword LUs have multiple lexemes
-        luinfo["semTypes"] = PrettyList()  # an LU can have multiple semtypes
+        luinfo['lexemes'] = PrettyList()  # multiword LUs have multiple lexemes
+        luinfo['semTypes'] = PrettyList()  # an LU can have multiple semtypes
 
         for sub in elt:
-            if sub.tag.endswith("definition"):
-                luinfo["definitionMarkup"] = sub.text
-                luinfo["definition"] = self._strip_tags(sub.text)
-            elif sub.tag.endswith("sentenceCount"):
-                luinfo["sentenceCount"] = self._load_xml_attributes(PrettyDict(), sub)
-            elif sub.tag.endswith("lexeme"):
+            if sub.tag.endswith('definition'):
+                luinfo['definitionMarkup'] = sub.text
+                luinfo['definition'] = self._strip_tags(sub.text)
+            elif sub.tag.endswith('sentenceCount'):
+                luinfo['sentenceCount'] = self._load_xml_attributes(PrettyDict(), sub)
+            elif sub.tag.endswith('lexeme'):
                 lexemeinfo = self._load_xml_attributes(PrettyDict(), sub)
-                if not isinstance(lexemeinfo.name, str):
+                if not isinstance(lexemeinfo.name, string_types):
                     # some lexeme names are ints by default: e.g.,
                     # thousand.num has lexeme with name="1000"
                     lexemeinfo.name = str(lexemeinfo.name)
-                luinfo["lexemes"].append(lexemeinfo)
-            elif sub.tag.endswith("semType"):
+                luinfo['lexemes'].append(lexemeinfo)
+            elif sub.tag.endswith('semType'):
                 semtypeinfo = self._load_xml_attributes(PrettyDict(), sub)
-                luinfo["semTypes"].append(self.semtype(semtypeinfo.ID))
+                luinfo['semTypes'].append(self.semtype(semtypeinfo.ID))
 
         # sort lexemes by 'order' attribute
         # otherwise, e.g., 'write down.v' may have lexemes in wrong order
-        luinfo["lexemes"].sort(key=lambda x: x.order)
+        luinfo['lexemes'].sort(key=lambda x: x.order)
 
         return luinfo
 
@@ -3067,33 +3076,33 @@ warnings(True) to display corpus consistency warnings when loading data
         (which are not included in frame files).
         """
         luinfo = self._load_xml_attributes(AttrDict(), elt)
-        luinfo["_type"] = "lu"
-        luinfo["definition"] = ""
-        luinfo["definitionMarkup"] = ""
-        luinfo["subCorpus"] = PrettyList()
-        luinfo["lexemes"] = PrettyList()  # multiword LUs have multiple lexemes
-        luinfo["semTypes"] = PrettyList()  # an LU can have multiple semtypes
+        luinfo['_type'] = 'lu'
+        luinfo['definition'] = ""
+        luinfo['definitionMarkup'] = ""
+        luinfo['subCorpus'] = PrettyList()
+        luinfo['lexemes'] = PrettyList()  # multiword LUs have multiple lexemes
+        luinfo['semTypes'] = PrettyList()  # an LU can have multiple semtypes
         for k in ignorekeys:
             if k in luinfo:
                 del luinfo[k]
 
         for sub in elt:
-            if sub.tag.endswith("header"):
+            if sub.tag.endswith('header'):
                 continue  # not used
-            elif sub.tag.endswith("valences"):
+            elif sub.tag.endswith('valences'):
                 continue  # not used
-            elif sub.tag.endswith("definition") and "definition" not in ignorekeys:
-                luinfo["definitionMarkup"] = sub.text
-                luinfo["definition"] = self._strip_tags(sub.text)
-            elif sub.tag.endswith("subCorpus") and "subCorpus" not in ignorekeys:
+            elif sub.tag.endswith('definition') and 'definition' not in ignorekeys:
+                luinfo['definitionMarkup'] = sub.text
+                luinfo['definition'] = self._strip_tags(sub.text)
+            elif sub.tag.endswith('subCorpus') and 'subCorpus' not in ignorekeys:
                 sc = self._handle_lusubcorpus_elt(sub)
                 if sc is not None:
-                    luinfo["subCorpus"].append(sc)
-            elif sub.tag.endswith("lexeme") and "lexeme" not in ignorekeys:
-                luinfo["lexemes"].append(self._load_xml_attributes(PrettyDict(), sub))
-            elif sub.tag.endswith("semType") and "semType" not in ignorekeys:
+                    luinfo['subCorpus'].append(sc)
+            elif sub.tag.endswith('lexeme') and 'lexeme' not in ignorekeys:
+                luinfo['lexemes'].append(self._load_xml_attributes(PrettyDict(), sub))
+            elif sub.tag.endswith('semType') and 'semType' not in ignorekeys:
                 semtypeinfo = self._load_xml_attributes(AttrDict(), sub)
-                luinfo["semTypes"].append(self.semtype(semtypeinfo.ID))
+                luinfo['semTypes'].append(self.semtype(semtypeinfo.ID))
 
         return luinfo
 
@@ -3101,99 +3110,99 @@ warnings(True) to display corpus consistency warnings when loading data
         """Load a subcorpus of a lexical unit from the given xml."""
         sc = AttrDict()
         try:
-            sc["name"] = elt.get("name")
+            sc['name'] = elt.get('name')
         except AttributeError:
             return None
-        sc["_type"] = "lusubcorpus"
-        sc["sentence"] = []
+        sc['_type'] = "lusubcorpus"
+        sc['sentence'] = []
 
         for sub in elt:
-            if sub.tag.endswith("sentence"):
+            if sub.tag.endswith('sentence'):
                 s = self._handle_lusentence_elt(sub)
                 if s is not None:
-                    sc["sentence"].append(s)
+                    sc['sentence'].append(s)
 
         return sc
 
     def _handle_lusentence_elt(self, elt):
         """Load a sentence from a subcorpus of an LU from xml."""
         info = self._load_xml_attributes(AttrDict(), elt)
-        info["_type"] = "lusentence"
-        info["annotationSet"] = []
-        info["_ascii"] = types.MethodType(
+        info['_type'] = 'lusentence'
+        info['annotationSet'] = []
+        info['_ascii'] = types.MethodType(
             _annotation_ascii, info
         )  # attach a method for this instance
         for sub in elt:
-            if sub.tag.endswith("text"):
-                info["text"] = self._strip_tags(sub.text)
-            elif sub.tag.endswith("annotationSet"):
+            if sub.tag.endswith('text'):
+                info['text'] = self._strip_tags(sub.text)
+            elif sub.tag.endswith('annotationSet'):
                 annset = self._handle_luannotationset_elt(
-                    sub, is_pos=(len(info["annotationSet"]) == 0)
+                    sub, is_pos=(len(info['annotationSet']) == 0)
                 )
                 if annset is not None:
-                    assert annset.status == "UNANN" or "FE" in annset, annset
-                    if annset.status != "UNANN":
-                        info["frameAnnotation"] = annset
+                    assert annset.status == 'UNANN' or 'FE' in annset, annset
+                    if annset.status != 'UNANN':
+                        info['frameAnnotation'] = annset
                     # copy layer info up to current level
                     for k in (
-                        "Target",
-                        "FE",
-                        "FE2",
-                        "FE3",
-                        "GF",
-                        "PT",
-                        "POS",
-                        "POS_tagset",
-                        "Other",
-                        "Sent",
-                        "Verb",
-                        "Noun",
-                        "Adj",
-                        "Adv",
-                        "Prep",
-                        "Scon",
-                        "Art",
+                        'Target',
+                        'FE',
+                        'FE2',
+                        'FE3',
+                        'GF',
+                        'PT',
+                        'POS',
+                        'POS_tagset',
+                        'Other',
+                        'Sent',
+                        'Verb',
+                        'Noun',
+                        'Adj',
+                        'Adv',
+                        'Prep',
+                        'Scon',
+                        'Art',
                     ):
                         if k in annset:
                             info[k] = annset[k]
-                    info["annotationSet"].append(annset)
-                    annset["sent"] = info
-                    annset["text"] = info.text
+                    info['annotationSet'].append(annset)
+                    annset['sent'] = info
+                    annset['text'] = info.text
         return info
 
     def _handle_luannotationset_elt(self, elt, is_pos=False):
         """Load an annotation set from a sentence in an subcorpus of an LU"""
         info = self._load_xml_attributes(AttrDict(), elt)
-        info["_type"] = "posannotationset" if is_pos else "luannotationset"
-        info["layer"] = []
-        info["_ascii"] = types.MethodType(
+        info['_type'] = 'posannotationset' if is_pos else 'luannotationset'
+        info['layer'] = []
+        info['_ascii'] = types.MethodType(
             _annotation_ascii, info
         )  # attach a method for this instance
 
-        if "cxnID" in info:  # ignoring construction annotations for now.
+        if 'cxnID' in info:  # ignoring construction annotations for now.
             return info
 
         for sub in elt:
-            if sub.tag.endswith("layer"):
+            if sub.tag.endswith('layer'):
                 l = self._handle_lulayer_elt(sub)
                 if l is not None:
                     overt = []
                     ni = {}  # null instantiations
 
-                    info["layer"].append(l)
+                    info['layer'].append(l)
                     for lbl in l.label:
-                        if "start" in lbl:
+                        if 'start' in lbl:
                             thespan = (lbl.start, lbl.end + 1, lbl.name)
                             if l.name not in (
-                                "Sent",
-                                "Other",
+                                'Sent',
+                                'Other',
                             ):  # 'Sent' and 'Other' layers sometimes contain accidental duplicate spans
                                 assert thespan not in overt, (info.ID, l.name, thespan)
                             overt.append(thespan)
                         else:  # null instantiation
                             if lbl.name in ni:
                                 self._warn(
-                                    "FE with multiple NI entries:",
+                                    'FE with multiple NI entries:',
                                     lbl.name,
                                     ni[lbl.name],
                                     lbl.itype,
@@ -3202,120 +3211,120 @@ warnings(True) to display corpus consistency warnings when loading data
                                 ni[lbl.name] = lbl.itype
                     overt = sorted(overt)
 
-                    if l.name == "Target":
+                    if l.name == 'Target':
                         if not overt:
                             self._warn(
-                                "Skipping empty Target layer in annotation set ID={0}".format(
+                                'Skipping empty Target layer in annotation set ID={0}'.format(
                                     info.ID
                                 )
                             )
                             continue
-                        assert all(lblname == "Target" for i, j, lblname in overt)
-                        if "Target" in info:
+                        assert all(lblname == 'Target' for i, j, lblname in overt)
+                        if 'Target' in info:
                             self._warn(
-                                "Annotation set {0} has multiple Target layers".format(
+                                'Annotation set {0} has multiple Target layers'.format(
                                     info.ID
                                 )
                             )
                         else:
-                            info["Target"] = [(i, j) for (i, j, _) in overt]
-                    elif l.name == "FE":
+                            info['Target'] = [(i, j) for (i, j, _) in overt]
+                    elif l.name == 'FE':
                         if l.rank == 1:
-                            assert "FE" not in info
-                            info["FE"] = (overt, ni)
+                            assert 'FE' not in info
+                            info['FE'] = (overt, ni)
                             # assert False,info
                         else:
                             # sometimes there are 3 FE layers! e.g. Change_position_on_a_scale.fall.v
                             assert 2 <= l.rank <= 3, l.rank
-                            k = "FE" + str(l.rank)
+                            k = 'FE' + str(l.rank)
                             assert k not in info
                             info[k] = (overt, ni)
-                    elif l.name in ("GF", "PT"):
+                    elif l.name in ('GF', 'PT'):
                         assert l.rank == 1
                         info[l.name] = overt
-                    elif l.name in ("BNC", "PENN"):
+                    elif l.name in ('BNC', 'PENN'):
                         assert l.rank == 1
-                        info["POS"] = overt
-                        info["POS_tagset"] = l.name
+                        info['POS'] = overt
+                        info['POS_tagset'] = l.name
                     else:
                         if is_pos:
-                            if l.name not in ("NER", "WSL"):
+                            if l.name not in ('NER', 'WSL'):
                                 self._warn(
-                                    "Unexpected layer in sentence annotationset:",
+                                    'Unexpected layer in sentence annotationset:',
                                     l.name,
                                 )
                         else:
                             if l.name not in (
-                                "Sent",
-                                "Verb",
-                                "Noun",
-                                "Adj",
-                                "Adv",
-                                "Prep",
-                                "Scon",
-                                "Art",
-                                "Other",
+                                'Sent',
+                                'Verb',
+                                'Noun',
+                                'Adj',
+                                'Adv',
+                                'Prep',
+                                'Scon',
+                                'Art',
+                                'Other',
                             ):
                                 self._warn(
-                                    "Unexpected layer in frame annotationset:", l.name
+                                    'Unexpected layer in frame annotationset:', l.name
                                 )
                         info[l.name] = overt
-        if not is_pos and "cxnID" not in info:
-            if "Target" not in info:
-                self._warn("Missing target in annotation set ID={0}".format(info.ID))
-            assert "FE" in info
-            if "FE3" in info:
-                assert "FE2" in info
+        if not is_pos and 'cxnID' not in info:
+            if 'Target' not in info:
+                self._warn('Missing target in annotation set ID={0}'.format(info.ID))
+            assert 'FE' in info
+            if 'FE3' in info:
+                assert 'FE2' in info
 
         return info
 
     def _handle_lulayer_elt(self, elt):
         """Load a layer from an annotation set"""
         layer = self._load_xml_attributes(AttrDict(), elt)
-        layer["_type"] = "lulayer"
-        layer["label"] = []
+        layer['_type'] = 'lulayer'
+        layer['label'] = []
 
         for sub in elt:
-            if sub.tag.endswith("label"):
+            if sub.tag.endswith('label'):
                 l = self._load_xml_attributes(AttrDict(), sub)
                 if l is not None:
-                    layer["label"].append(l)
+                    layer['label'].append(l)
         return layer
 
     def _handle_fe_elt(self, elt):
         feinfo = self._load_xml_attributes(AttrDict(), elt)
-        feinfo["_type"] = "fe"
-        feinfo["definition"] = ""
-        feinfo["definitionMarkup"] = ""
-        feinfo["semType"] = None
-        feinfo["requiresFE"] = None
-        feinfo["excludesFE"] = None
+        feinfo['_type'] = 'fe'
+        feinfo['definition'] = ""
+        feinfo['definitionMarkup'] = ""
+        feinfo['semType'] = None
+        feinfo['requiresFE'] = None
+        feinfo['excludesFE'] = None
         for sub in elt:
-            if sub.tag.endswith("definition"):
-                feinfo["definitionMarkup"] = sub.text
-                feinfo["definition"] = self._strip_tags(sub.text)
-            elif sub.tag.endswith("semType"):
+            if sub.tag.endswith('definition'):
+                feinfo['definitionMarkup'] = sub.text
+                feinfo['definition'] = self._strip_tags(sub.text)
+            elif sub.tag.endswith('semType'):
                 stinfo = self._load_xml_attributes(AttrDict(), sub)
-                feinfo["semType"] = self.semtype(stinfo.ID)
-            elif sub.tag.endswith("requiresFE"):
-                feinfo["requiresFE"] = self._load_xml_attributes(AttrDict(), sub)
-            elif sub.tag.endswith("excludesFE"):
-                feinfo["excludesFE"] = self._load_xml_attributes(AttrDict(), sub)
+                feinfo['semType'] = self.semtype(stinfo.ID)
+            elif sub.tag.endswith('requiresFE'):
+                feinfo['requiresFE'] = self._load_xml_attributes(AttrDict(), sub)
+            elif sub.tag.endswith('excludesFE'):
+                feinfo['excludesFE'] = self._load_xml_attributes(AttrDict(), sub)
 
         return feinfo
 
     def _handle_semtype_elt(self, elt, tagspec=None):
         semt = self._load_xml_attributes(AttrDict(), elt)
-        semt["_type"] = "semtype"
-        semt["superType"] = None
-        semt["subTypes"] = PrettyList()
+        semt['_type'] = 'semtype'
+        semt['superType'] = None
+        semt['subTypes'] = PrettyList()
         for sub in elt:
             if sub.text is not None:
-                semt["definitionMarkup"] = sub.text
-                semt["definition"] = self._strip_tags(sub.text)
+                semt['definitionMarkup'] = sub.text
+                semt['definition'] = self._strip_tags(sub.text)
             else:
                 supertypeinfo = self._load_xml_attributes(AttrDict(), sub)
-                semt["superType"] = supertypeinfo
+                semt['superType'] = supertypeinfo
                 # the supertype may not have been loaded yet
 
         return semt
@@ -3332,15 +3341,15 @@ def demo():
     # buildindexes(). We do this here just for demo purposes. If the
     # indexes are not built explicitly, they will be built as needed.
     #
-    print("Building the indexes...")
+    print('Building the indexes...')
     fn.buildindexes()
 
     #
     # Get some statistics about the corpus
     #
-    print("Number of Frames:", len(fn.frames()))
-    print("Number of Lexical Units:", len(fn.lus()))
-    print("Number of annotated documents:", len(fn.docs()))
+    print('Number of Frames:', len(fn.frames()))
+    print('Number of Lexical Units:', len(fn.lus()))
+    print('Number of annotated documents:', len(fn.docs()))
     print()
 
     #
@@ -3349,7 +3358,7 @@ def demo():
     print(
         'getting frames whose name matches the (case insensitive) regex: "(?i)medical"'
     )
-    medframes = fn.frames(r"(?i)medical")
+    medframes = fn.frames(r'(?i)medical')
     print('Found {0} Frames whose name matches "(?i)medical":'.format(len(medframes)))
     print([(f.name, f.ID) for f in medframes])
 
@@ -3369,7 +3378,7 @@ def demo():
         len(m_frame.frameRelations),
     )
     for fr in m_frame.frameRelations:
-        print("   ", fr)
+        print('   ', fr)
 
     #
     # get the names of the Frame Elements
@@ -3378,13 +3387,13 @@ def demo():
         '\nNumber of Frame Elements in the "{0}" frame:'.format(m_frame.name),
         len(m_frame.FE),
     )
-    print("   ", [x for x in m_frame.FE])
+    print('   ', [x for x in m_frame.FE])
 
     #
     # get the names of the "Core" Frame Elements
     #
     print('\nThe "core" Frame Elements in the "{0}" frame:'.format(m_frame.name))
-    print("   ", [x.name for x in m_frame.FE.values() if x.coreType == "Core"])
+    print('   ', [x.name for x in m_frame.FE.values() if x.coreType == "Core"])
 
     #
     # get all of the Lexical Units that are incorporated in the
@@ -3395,9 +3404,9 @@ def demo():
     ailment_lus = [
         x
         for x in m_frame.lexUnit.values()
-        if "incorporatedFE" in x and x.incorporatedFE == "Ailment"
+        if 'incorporatedFE' in x and x.incorporatedFE == 'Ailment'
     ]
-    print("   ", [x.name for x in ailment_lus])
+    print('   ', [x.name for x in ailment_lus])
 
     #
     # get all of the Lexical Units for the frame
@@ -3406,20 +3415,20 @@ def demo():
         '\nNumber of Lexical Units in the "{0}" frame:'.format(m_frame.name),
         len(m_frame.lexUnit),
     )
-    print("  ", [x.name for x in m_frame.lexUnit.values()][:5], "...")
+    print('  ', [x.name for x in m_frame.lexUnit.values()][:5], '...')
 
     #
     # get basic info on the second LU in the frame
     #
-    tmp_id = m_frame.lexUnit["ailment.n"].ID  # grab the id of the specified LU
+    tmp_id = m_frame.lexUnit['ailment.n'].ID  # grab the id of the specified LU
     luinfo = fn.lu_basic(tmp_id)  # get basic info on the LU
-    print("\nInformation on the LU: {0}".format(luinfo.name))
+    print('\nInformation on the LU: {0}'.format(luinfo.name))
     pprint(luinfo)
 
     #
     # Get a list of all of the corpora used for fulltext annotation
     #
-    print("\nNames of all of the corpora used for fulltext annotation:")
+    print('\nNames of all of the corpora used for fulltext annotation:')
     allcorpora = set(x.corpname for x in fn.docs_metadata())
     pprint(list(allcorpora))
 
@@ -3443,8 +3452,8 @@ def demo():
     print(
         '\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":'
     )
-    pprint(fn.frames_by_lemma(r"^run.v$"))
+    pprint(fn.frames_by_lemma(r'^run.v$'))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
index 6f80742..1628e9c 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: IEER Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -20,28 +20,32 @@ and filenames were shortened.
 The corpus contains the following files: APW_19980314, APW_19980424,
 APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
 """
+from __future__ import unicode_literals
+
+from six import string_types
 
 import nltk
+from nltk import compat
 from nltk.corpus.reader.api import *
 
 #: A dictionary whose keys are the names of documents in this corpus;
 #: and whose values are descriptions of those documents' contents.
 titles = {
-    "APW_19980314": "Associated Press Weekly, 14 March 1998",
-    "APW_19980424": "Associated Press Weekly, 24 April 1998",
-    "APW_19980429": "Associated Press Weekly, 29 April 1998",
-    "NYT_19980315": "New York Times, 15 March 1998",
-    "NYT_19980403": "New York Times, 3 April 1998",
-    "NYT_19980407": "New York Times, 7 April 1998",
+    'APW_19980314': 'Associated Press Weekly, 14 March 1998',
+    'APW_19980424': 'Associated Press Weekly, 24 April 1998',
+    'APW_19980429': 'Associated Press Weekly, 29 April 1998',
+    'NYT_19980315': 'New York Times, 15 March 1998',
+    'NYT_19980403': 'New York Times, 3 April 1998',
+    'NYT_19980407': 'New York Times, 7 April 1998',
 }
 
 #: A list of all documents in this corpus.
 documents = sorted(titles)
 
 
-
+@compat.python_2_unicode_compatible
 class IEERDocument(object):
-    def __init__(self, text, docno=None, doctype=None, date_time=None, headline=""):
+    def __init__(self, text, docno=None, doctype=None, date_time=None, headline=''):
         self.text = text
         self.docno = docno
         self.doctype = doctype
@@ -50,15 +54,15 @@ class IEERDocument(object):
 
     def __repr__(self):
         if self.headline:
-            headline = " ".join(self.headline.leaves())
+            headline = ' '.join(self.headline.leaves())
         else:
             headline = (
-                " ".join([w for w in self.text.leaves() if w[:1] != "<"][:12]) + "..."
+                ' '.join([w for w in self.text.leaves() if w[:1] != '<'][:12]) + '...'
             )
         if self.docno is not None:
-            return "<IEERDocument %s: %r>" % (self.docno, headline)
+            return '<IEERDocument %s: %r>' % (self.docno, headline)
         else:
-            return "<IEERDocument: %r>" % headline
+            return '<IEERDocument: %r>' % headline
 
 
 class IEERCorpusReader(CorpusReader):
@@ -68,7 +72,7 @@ class IEERCorpusReader(CorpusReader):
     def raw(self, fileids=None):
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
@@ -110,7 +114,7 @@ class IEERCorpusReader(CorpusReader):
             line = stream.readline()
             if not line:
                 break
-            if line.strip() == "<DOC>":
+            if line.strip() == '<DOC>':
                 break
         out.append(line)
         # Read the document
@@ -119,7 +123,7 @@ class IEERCorpusReader(CorpusReader):
             if not line:
                 break
             out.append(line)
-            if line.strip() == "</DOC>":
+            if line.strip() == '</DOC>':
                 break
         # Return the document
-        return ["\n".join(out)]
+        return ['\n'.join(out)]
index 0788b54..6f39754 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -18,6 +18,8 @@ Contents:
   - Telugu: IIIT Hyderabad
 """
 
+from six import string_types
+
 from nltk.tag import str2tuple, map_tag
 
 from nltk.corpus.reader.util import *
@@ -72,7 +74,7 @@ class IndianCorpusReader(CorpusReader):
     def raw(self, fileids=None):
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
@@ -88,9 +90,9 @@ class IndianCorpusView(StreamBackedCorpusView):
 
     def read_block(self, stream):
         line = stream.readline()
-        if line.startswith("<"):
+        if line.startswith('<'):
             return []
-        sent = [str2tuple(word, sep="_") for word in line.split()]
+        sent = [str2tuple(word, sep='_') for word in line.split()]
         if self._tag_mapping_function:
             sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent]
         if not self._tagged:
index de983dd..47c509d 100644 (file)
@@ -1,12 +1,14 @@
 # Natural Language Toolkit: IPI PAN Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Konrad Goluchowski <kodie@mimuw.edu.pl>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 import functools
 
+from six import string_types
+
 from nltk.corpus.reader.util import StreamBackedCorpusView, concat
 from nltk.corpus.reader.api import CorpusReader
 
@@ -14,7 +16,7 @@ from nltk.corpus.reader.api import CorpusReader
 def _parse_args(fun):
     @functools.wraps(fun)
     def decorator(self, fileids=None, **kwargs):
-        kwargs.pop("tags", None)
+        kwargs.pop('tags', None)
         if not fileids:
             fileids = self.fileids()
         return fun(self, fileids, **kwargs)
@@ -67,48 +69,48 @@ class IPIPANCorpusReader(CorpusReader):
 
         filecontents = []
         for fileid in self._list_morph_files(fileids):
-            with open(fileid, "r") as infile:
+            with open(fileid, 'r') as infile:
                 filecontents.append(infile.read())
-        return "".join(filecontents)
+        return ''.join(filecontents)
 
     def channels(self, fileids=None):
         if not fileids:
             fileids = self.fileids()
-        return self._parse_header(fileids, "channel")
+        return self._parse_header(fileids, 'channel')
 
     def domains(self, fileids=None):
         if not fileids:
             fileids = self.fileids()
-        return self._parse_header(fileids, "domain")
+        return self._parse_header(fileids, 'domain')
 
     def categories(self, fileids=None):
         if not fileids:
             fileids = self.fileids()
         return [
-            self._map_category(cat) for cat in self._parse_header(fileids, "keyTerm")
+            self._map_category(cat) for cat in self._parse_header(fileids, 'keyTerm')
         ]
 
     def fileids(self, channels=None, domains=None, categories=None):
         if channels is not None and domains is not None and categories is not None:
             raise ValueError(
-                "You can specify only one of channels, domains "
-                "and categories parameter at once"
+                'You can specify only one of channels, domains '
+                'and categories parameter at once'
             )
         if channels is None and domains is None and categories is None:
             return CorpusReader.fileids(self)
-        if isinstance(channels, str):
+        if isinstance(channels, string_types):
             channels = [channels]
-        if isinstance(domains, str):
+        if isinstance(domains, string_types):
             domains = [domains]
-        if isinstance(categories, str):
+        if isinstance(categories, string_types):
             categories = [categories]
         if channels:
-            return self._list_morph_files_by("channel", channels)
+            return self._list_morph_files_by('channel', channels)
         elif domains:
-            return self._list_morph_files_by("domain", domains)
+            return self._list_morph_files_by('domain', domains)
         else:
             return self._list_morph_files_by(
-                "keyTerm", categories, map=self._map_category
+                'keyTerm', categories, map=self._map_category
             )
 
     @_parse_args
@@ -171,7 +173,7 @@ class IPIPANCorpusReader(CorpusReader):
 
     def _list_header_files(self, fileids):
         return [
-            f.replace("morph.xml", "header.xml")
+            f.replace('morph.xml', 'header.xml')
             for f in self._list_morph_files(fileids)
         ]
 
@@ -187,7 +189,7 @@ class IPIPANCorpusReader(CorpusReader):
         fileids = self.fileids()
         ret_fileids = set()
         for f in fileids:
-            fp = self.abspath(f).replace("morph.xml", "header.xml")
+            fp = self.abspath(f).replace('morph.xml', 'header.xml')
             values_list = self._get_tag(fp, tag)
             for value in values_list:
                 if map is not None:
@@ -198,43 +200,43 @@ class IPIPANCorpusReader(CorpusReader):
 
     def _get_tag(self, f, tag):
         tags = []
-        with open(f, "r") as infile:
+        with open(f, 'r') as infile:
             header = infile.read()
         tag_end = 0
         while True:
-            tag_pos = header.find("<" + tag, tag_end)
+            tag_pos = header.find('<' + tag, tag_end)
             if tag_pos < 0:
                 return tags
-            tag_end = header.find("</" + tag + ">", tag_pos)
+            tag_end = header.find('</' + tag + '>', tag_pos)
             tags.append(header[tag_pos + len(tag) + 2 : tag_end])
 
     def _map_category(self, cat):
-        pos = cat.find(">")
+        pos = cat.find('>')
         if pos == -1:
             return cat
         else:
             return cat[pos + 1 :]
 
     def _view(self, filename, **kwargs):
-        tags = kwargs.pop("tags", True)
-        mode = kwargs.pop("mode", 0)
-        simplify_tags = kwargs.pop("simplify_tags", False)
-        one_tag = kwargs.pop("one_tag", True)
-        disamb_only = kwargs.pop("disamb_only", True)
-        append_no_space = kwargs.pop("append_no_space", False)
-        append_space = kwargs.pop("append_space", False)
-        replace_xmlentities = kwargs.pop("replace_xmlentities", True)
+        tags = kwargs.pop('tags', True)
+        mode = kwargs.pop('mode', 0)
+        simplify_tags = kwargs.pop('simplify_tags', False)
+        one_tag = kwargs.pop('one_tag', True)
+        disamb_only = kwargs.pop('disamb_only', True)
+        append_no_space = kwargs.pop('append_no_space', False)
+        append_space = kwargs.pop('append_space', False)
+        replace_xmlentities = kwargs.pop('replace_xmlentities', True)
 
         if len(kwargs) > 0:
-            raise ValueError("Unexpected arguments: %s" % kwargs.keys())
+            raise ValueError('Unexpected arguments: %s' % kwargs.keys())
         if not one_tag and not disamb_only:
             raise ValueError(
-                "You cannot specify both one_tag=False and " "disamb_only=False"
+                'You cannot specify both one_tag=False and ' 'disamb_only=False'
             )
         if not tags and (simplify_tags or not one_tag or not disamb_only):
             raise ValueError(
-                "You cannot specify simplify_tags, one_tag or "
-                "disamb_only with functions other than tagged_*"
+                'You cannot specify simplify_tags, one_tag or '
+                'disamb_only with functions other than tagged_*'
             )
 
         return IPIPANCorpusView(
@@ -261,14 +263,14 @@ class IPIPANCorpusView(StreamBackedCorpusView):
         self.in_sentence = False
         self.position = 0
 
-        self.show_tags = kwargs.pop("tags", True)
-        self.disamb_only = kwargs.pop("disamb_only", True)
-        self.mode = kwargs.pop("mode", IPIPANCorpusView.WORDS_MODE)
-        self.simplify_tags = kwargs.pop("simplify_tags", False)
-        self.one_tag = kwargs.pop("one_tag", True)
-        self.append_no_space = kwargs.pop("append_no_space", False)
-        self.append_space = kwargs.pop("append_space", False)
-        self.replace_xmlentities = kwargs.pop("replace_xmlentities", True)
+        self.show_tags = kwargs.pop('tags', True)
+        self.disamb_only = kwargs.pop('disamb_only', True)
+        self.mode = kwargs.pop('mode', IPIPANCorpusView.WORDS_MODE)
+        self.simplify_tags = kwargs.pop('simplify_tags', False)
+        self.one_tag = kwargs.pop('one_tag', True)
+        self.append_no_space = kwargs.pop('append_no_space', False)
+        self.append_space = kwargs.pop('append_space', False)
+        self.replace_xmlentities = kwargs.pop('replace_xmlentities', True)
 
     def read_block(self, stream):
         sentence = []
@@ -287,7 +289,7 @@ class IPIPANCorpusView(StreamBackedCorpusView):
                 self._seek(stream)
                 lines = self._read_data(stream)
 
-            if lines == [""]:
+            if lines == ['']:
                 assert not sentences
                 return []
 
@@ -298,14 +300,14 @@ class IPIPANCorpusView(StreamBackedCorpusView):
                 self.in_sentence = True
             elif line.startswith('<chunk type="p"'):
                 pass
-            elif line.startswith("<tok"):
+            elif line.startswith('<tok'):
                 if self.append_space and space and not no_space:
                     self._append_space(sentence)
                 space = True
                 no_space = False
                 orth = ""
                 tags = set()
-            elif line.startswith("</chunk"):
+            elif line.startswith('</chunk'):
                 if self.in_sentence:
                     self.in_sentence = False
                     self._seek(stream)
@@ -320,39 +322,39 @@ class IPIPANCorpusView(StreamBackedCorpusView):
                 elif self.mode == self.PARAS_MODE:
                     self._seek(stream)
                     return [sentences]
-            elif line.startswith("<orth"):
+            elif line.startswith('<orth'):
                 orth = line[6:-7]
                 if self.replace_xmlentities:
-                    orth = orth.replace("&quot;", '"').replace("&amp;", "&")
-            elif line.startswith("<lex"):
-                if not self.disamb_only or line.find("disamb=") != -1:
-                    tag = line[line.index("<ctag") + 6 : line.index("</ctag")]
+                    orth = orth.replace('&quot;', '"').replace('&amp;', '&')
+            elif line.startswith('<lex'):
+                if not self.disamb_only or line.find('disamb=') != -1:
+                    tag = line[line.index('<ctag') + 6 : line.index('</ctag')]
                     tags.add(tag)
-            elif line.startswith("</tok"):
+            elif line.startswith('</tok'):
                 if self.show_tags:
                     if self.simplify_tags:
-                        tags = [t.split(":")[0] for t in tags]
+                        tags = [t.split(':')[0] for t in tags]
                     if not self.one_tag or not self.disamb_only:
                         sentence.append((orth, tuple(tags)))
                     else:
                         sentence.append((orth, tags.pop()))
                 else:
                     sentence.append(orth)
-            elif line.startswith("<ns/>"):
+            elif line.startswith('<ns/>'):
                 if self.append_space:
                     no_space = True
                 if self.append_no_space:
                     if self.show_tags:
-                        sentence.append(("", "no-space"))
+                        sentence.append(('', 'no-space'))
                     else:
-                        sentence.append("")
-            elif line.startswith("</cesAna"):
+                        sentence.append('')
+            elif line.startswith('</cesAna'):
                 pass
 
     def _read_data(self, stream):
         self.position = stream.tell()
         buff = stream.read(4096)
-        lines = buff.split("\n")
+        lines = buff.split('\n')
         lines.reverse()
         return lines
 
@@ -361,6 +363,6 @@ class IPIPANCorpusView(StreamBackedCorpusView):
 
     def _append_space(self, sentence):
         if self.show_tags:
-            sentence.append((" ", "space"))
+            sentence.append((' ', 'space'))
         else:
-            sentence.append(" ")
+            sentence.append(' ')
index 965a6fe..741d7c6 100644 (file)
@@ -1,13 +1,15 @@
 #! /usr/bin/env python
 # KNB Corpus reader
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Masato Hagiwara <hagisan@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 # For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
+from __future__ import print_function
 
 import re
+from six import string_types
 
 from nltk.parse import DependencyGraph
 
@@ -19,7 +21,7 @@ from nltk.corpus.reader.util import (
 from nltk.corpus.reader.api import SyntaxCorpusReader, CorpusReader
 
 # default function to convert morphlist to str for tree representation
-_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
+_morphs2str_default = lambda morphs: '/'.join(m[0] for m in morphs if m[0] != 'EOS')
 
 
 class KNBCorpusReader(SyntaxCorpusReader):
@@ -54,7 +56,7 @@ class KNBCorpusReader(SyntaxCorpusReader):
 
     """
 
-    def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
+    def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
         """
         Initialize KNBCorpusReader
         morphs2str is a function to convert morphlist to str for tree representation
@@ -87,7 +89,7 @@ class KNBCorpusReader(SyntaxCorpusReader):
             if not re.match(r"EOS|\*|\#|\+", line):
                 cells = line.strip().split(" ")
                 # convert cells to morph tuples
-                res.append((cells[0], " ".join(cells[1:])))
+                res.append((cells[0], ' '.join(cells[1:])))
 
         return res
 
@@ -95,7 +97,7 @@ class KNBCorpusReader(SyntaxCorpusReader):
         dg = DependencyGraph()
         i = 0
         for line in t.splitlines():
-            if line[0] in "*+":
+            if line[0] in '*+':
                 # start of bunsetsu or tag
 
                 cells = line.strip().split(" ", 3)
@@ -104,26 +106,26 @@ class KNBCorpusReader(SyntaxCorpusReader):
                 assert m is not None
 
                 node = dg.nodes[i]
-                node.update({"address": i, "rel": m.group(2), "word": []})
+                node.update({'address': i, 'rel': m.group(2), 'word': []})
 
                 dep_parent = int(m.group(1))
 
                 if dep_parent == -1:
                     dg.root = node
                 else:
-                    dg.nodes[dep_parent]["deps"].append(i)
+                    dg.nodes[dep_parent]['deps'].append(i)
 
                 i += 1
-            elif line[0] != "#":
+            elif line[0] != '#':
                 # normal morph
                 cells = line.strip().split(" ")
                 # convert cells to morph tuples
-                morph = cells[0], " ".join(cells[1:])
-                dg.nodes[i - 1]["word"].append(morph)
+                morph = cells[0], ' '.join(cells[1:])
+                dg.nodes[i - 1]['word'].append(morph)
 
         if self.morphs2str:
             for node in dg.nodes.values():
-                node["word"] = self.morphs2str(node["word"])
+                node['word'] = self.morphs2str(node['word'])
 
         return dg.tree()
 
@@ -138,7 +140,7 @@ def demo():
     import nltk
     from nltk.corpus.util import LazyCorpusLoader
 
-    root = nltk.data.find("corpora/knbc/corpus1")
+    root = nltk.data.find('corpora/knbc/corpus1')
     fileids = [
         f
         for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
@@ -146,30 +148,30 @@ def demo():
     ]
 
     def _knbc_fileids_sort(x):
-        cells = x.split("-")
+        cells = x.split('-')
         return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))
 
     knbc = LazyCorpusLoader(
-        "knbc/corpus1",
+        'knbc/corpus1',
         KNBCorpusReader,
         sorted(fileids, key=_knbc_fileids_sort),
-        encoding="euc-jp",
+        encoding='euc-jp',
     )
 
     print(knbc.fileids()[:10])
-    print("".join(knbc.words()[:100]))
+    print(''.join(knbc.words()[:100]))
 
-    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))
+    print('\n\n'.join(str(tree) for tree in knbc.parsed_sents()[:2]))
 
-    knbc.morphs2str = lambda morphs: "/".join(
-        "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
-    ).encode("utf-8")
+    knbc.morphs2str = lambda morphs: '/'.join(
+        "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
+    ).encode('utf-8')
 
-    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))
+    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))
 
     print(
-        "\n".join(
-            " ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent)
+        '\n'.join(
+            ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
             for sent in knbc.tagged_sents()[0:2]
         )
     )
@@ -180,13 +182,13 @@ def test():
     from nltk.corpus.util import LazyCorpusLoader
 
     knbc = LazyCorpusLoader(
-        "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
+        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp'
     )
-    assert isinstance(knbc.words()[0], str)
-    assert isinstance(knbc.sents()[0][0], str)
+    assert isinstance(knbc.words()[0], string_types)
+    assert isinstance(knbc.sents()[0][0], string_types)
     assert isinstance(knbc.tagged_words()[0], tuple)
     assert isinstance(knbc.tagged_sents()[0][0], tuple)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
index 613a275..493b1b0 100644 (file)
@@ -1,9 +1,10 @@
 # Natural Language Toolkit: Lin's Thesaurus
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Dan Blanchard <dblanchard@ets.org>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.txt
+from __future__ import print_function
 
 import re
 from collections import defaultdict
@@ -21,20 +22,20 @@ class LinThesaurusCorpusReader(CorpusReader):
 
     @staticmethod
     def __defaultdict_factory():
-        """ Factory for creating defaultdict of defaultdict(dict)s """
+        ''' Factory for creating defaultdict of defaultdict(dict)s '''
         return defaultdict(dict)
 
     def __init__(self, root, badscore=0.0):
-        """
+        '''
         Initialize the thesaurus.
 
         :param root: root directory containing thesaurus LISP files
         :type root: C{string}
         :param badscore: the score to give to words which do not appear in each other's sets of synonyms
         :type badscore: C{float}
-        """
+        '''
 
-        super(LinThesaurusCorpusReader, self).__init__(root, r"sim[A-Z]\.lsp")
+        super(LinThesaurusCorpusReader, self).__init__(root, r'sim[A-Z]\.lsp')
         self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
         self._badscore = badscore
         for path, encoding, fileid in self.abspaths(
@@ -46,14 +47,14 @@ class LinThesaurusCorpusReader(CorpusReader):
                     line = line.strip()
                     # Start of entry
                     if first:
-                        key = LinThesaurusCorpusReader._key_re.sub(r"\1", line)
+                        key = LinThesaurusCorpusReader._key_re.sub(r'\1', line)
                         first = False
                     # End of entry
-                    elif line == "))":
+                    elif line == '))':
                         first = True
                     # Lines with pairs of ngrams and scores
                     else:
-                        split_line = line.split("\t")
+                        split_line = line.split('\t')
                         if len(split_line) == 2:
                             ngram, score = split_line
                             self._thesaurus[fileid][key][ngram.strip('"')] = float(
@@ -61,7 +62,7 @@ class LinThesaurusCorpusReader(CorpusReader):
                             )
 
     def similarity(self, ngram1, ngram2, fileid=None):
-        """
+        '''
         Returns the similarity score for two ngrams.
 
         :param ngram1: first ngram to compare
@@ -72,7 +73,7 @@ class LinThesaurusCorpusReader(CorpusReader):
         :type fileid: C{string}
         :return: If fileid is specified, just the score for the two ngrams; otherwise,
                  list of tuples of fileids and scores.
-        """
+        '''
         # Entries don't contain themselves, so make sure similarity between item and itself is 1.0
         if ngram1 == ngram2:
             if fileid:
@@ -100,7 +101,7 @@ class LinThesaurusCorpusReader(CorpusReader):
                 ]
 
     def scored_synonyms(self, ngram, fileid=None):
-        """
+        '''
         Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram
 
         :param ngram: ngram to lookup
@@ -110,7 +111,7 @@ class LinThesaurusCorpusReader(CorpusReader):
         :return: If fileid is specified, list of tuples of scores and synonyms; otherwise,
                  list of tuples of fileids and lists, where inner lists consist of tuples of
                  scores and synonyms.
-        """
+        '''
         if fileid:
             return self._thesaurus[fileid][ngram].items()
         else:
@@ -120,7 +121,7 @@ class LinThesaurusCorpusReader(CorpusReader):
             ]
 
     def synonyms(self, ngram, fileid=None):
-        """
+        '''
         Returns a list of synonyms for the current ngram.
 
         :param ngram: ngram to lookup
@@ -129,7 +130,7 @@ class LinThesaurusCorpusReader(CorpusReader):
         :type fileid: C{string}
         :return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and
                  lists, where inner lists contain synonyms.
-        """
+        '''
         if fileid:
             return self._thesaurus[fileid][ngram].keys()
         else:
@@ -139,13 +140,13 @@ class LinThesaurusCorpusReader(CorpusReader):
             ]
 
     def __contains__(self, ngram):
-        """
+        '''
         Determines whether or not the given ngram is in the thesaurus.
 
         :param ngram: ngram to lookup
         :type ngram: C{string}
         :return: whether the given ngram is in the thesaurus.
-        """
+        '''
         return reduce(
             lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]),
             self._fileids,
@@ -179,5 +180,5 @@ def demo():
     print(thes.similarity(word1, word2))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
index 085f257..4198d3f 100644 (file)
@@ -5,6 +5,8 @@ import os
 import re
 from functools import reduce
 
+from six import string_types
+
 from nltk.corpus.reader import concat, TaggedCorpusReader
 from nltk.corpus.reader.xmldocs import XMLCorpusView
 
@@ -38,11 +40,11 @@ class MTEFileReader:
     """
 
     ns = {
-        "tei": "http://www.tei-c.org/ns/1.0",
-        "xml": "http://www.w3.org/XML/1998/namespace",
+        'tei': 'http://www.tei-c.org/ns/1.0',
+        'xml': 'http://www.w3.org/XML/1998/namespace',
     }
-    tag_ns = "{http://www.tei-c.org/ns/1.0}"
-    xml_ns = "{http://www.w3.org/XML/1998/namespace}"
+    tag_ns = '{http://www.tei-c.org/ns/1.0}'
+    xml_ns = '{http://www.w3.org/XML/1998/namespace}'
     word_path = "TEI/text/body/div/div/p/s/(w|c)"
     sent_path = "TEI/text/body/div/div/p/s"
     para_path = "TEI/text/body/div/div/p"
@@ -56,30 +58,30 @@ class MTEFileReader:
 
     @classmethod
     def _sent_elt(cls, elt, context):
-        return [cls._word_elt(w, None) for w in xpath(elt, "*", cls.ns)]
+        return [cls._word_elt(w, None) for w in xpath(elt, '*', cls.ns)]
 
     @classmethod
     def _para_elt(cls, elt, context):
-        return [cls._sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]
+        return [cls._sent_elt(s, None) for s in xpath(elt, '*', cls.ns)]
 
     @classmethod
     def _tagged_word_elt(cls, elt, context):
-        if "ana" not in elt.attrib:
-            return (elt.text, "")
+        if 'ana' not in elt.attrib:
+            return (elt.text, '')
 
         if cls.__tags == "" and cls.__tagset == "msd":
-            return (elt.text, elt.attrib["ana"])
+            return (elt.text, elt.attrib['ana'])
         elif cls.__tags == "" and cls.__tagset == "universal":
-            return (elt.text, MTETagConverter.msd_to_universal(elt.attrib["ana"]))
+            return (elt.text, MTETagConverter.msd_to_universal(elt.attrib['ana']))
         else:
-            tags = re.compile("^" + re.sub("-", ".", cls.__tags) + ".*$")
-            if tags.match(elt.attrib["ana"]):
+            tags = re.compile('^' + re.sub("-", ".", cls.__tags) + '.*$')
+            if tags.match(elt.attrib['ana']):
                 if cls.__tagset == "msd":
-                    return (elt.text, elt.attrib["ana"])
+                    return (elt.text, elt.attrib['ana'])
                 else:
                     return (
                         elt.text,
-                        MTETagConverter.msd_to_universal(elt.attrib["ana"]),
+                        MTETagConverter.msd_to_universal(elt.attrib['ana']),
                     )
             else:
                 return None
@@ -89,7 +91,7 @@ class MTEFileReader:
         return list(
             filter(
                 lambda x: x is not None,
-                [cls._tagged_word_elt(w, None) for w in xpath(elt, "*", cls.ns)],
+                [cls._tagged_word_elt(w, None) for w in xpath(elt, '*', cls.ns)],
             )
         )
 
@@ -98,24 +100,24 @@ class MTEFileReader:
         return list(
             filter(
                 lambda x: x is not None,
-                [cls._tagged_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)],
+                [cls._tagged_sent_elt(s, None) for s in xpath(elt, '*', cls.ns)],
             )
         )
 
     @classmethod
     def _lemma_word_elt(cls, elt, context):
-        if "lemma" not in elt.attrib:
-            return (elt.text, "")
+        if 'lemma' not in elt.attrib:
+            return (elt.text, '')
         else:
-            return (elt.text, elt.attrib["lemma"])
+            return (elt.text, elt.attrib['lemma'])
 
     @classmethod
     def _lemma_sent_elt(cls, elt, context):
-        return [cls._lemma_word_elt(w, None) for w in xpath(elt, "*", cls.ns)]
+        return [cls._lemma_word_elt(w, None) for w in xpath(elt, '*', cls.ns)]
 
     @classmethod
     def _lemma_para_elt(cls, elt, context):
-        return [cls._lemma_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)]
+        return [cls._lemma_sent_elt(s, None) for s in xpath(elt, '*', cls.ns)]
 
     def words(self):
         return MTECorpusView(
@@ -176,18 +178,18 @@ class MTETagConverter:
     """
 
     mapping_msd_universal = {
-        "A": "ADJ",
-        "S": "ADP",
-        "R": "ADV",
-        "C": "CONJ",
-        "D": "DET",
-        "N": "NOUN",
-        "M": "NUM",
-        "Q": "PRT",
-        "P": "PRON",
-        "V": "VERB",
-        ".": ".",
-        "-": "X",
+        'A': 'ADJ',
+        'S': 'ADP',
+        'R': 'ADV',
+        'C': 'CONJ',
+        'D': 'DET',
+        'N': 'NOUN',
+        'M': 'NUM',
+        'Q': 'PRT',
+        'P': 'PRON',
+        'V': 'VERB',
+        '.': '.',
+        '-': 'X',
     }
 
     @staticmethod
@@ -201,7 +203,7 @@ class MTETagConverter:
         indicator = tag[0] if not tag[0] == "#" else tag[1]
 
         if not indicator in MTETagConverter.mapping_msd_universal:
-            indicator = "-"
+            indicator = '-'
 
         return MTETagConverter.mapping_msd_universal[indicator]
 
@@ -213,7 +215,7 @@ class MTECorpusReader(TaggedCorpusReader):
     scheme. These tags can be converted to the Universal tagset
     """
 
-    def __init__(self, root=None, fileids=None, encoding="utf8"):
+    def __init__(self, root=None, fileids=None, encoding='utf8'):
         """
         Construct a new MTECorpusreader for a set of documents
         located at the given root directory.  Example usage:
@@ -230,7 +232,7 @@ class MTECorpusReader(TaggedCorpusReader):
     def __fileids(self, fileids):
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         # filter wrong userinput
         fileids = filter(lambda x: x in self._fileids, fileids)
index 23be4b6..aea84b0 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: NKJP Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Gabriela Kaczka
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -10,6 +10,8 @@ import os
 import re
 import tempfile
 
+from six import string_types
+
 from nltk.corpus.reader.util import concat
 from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
 
@@ -35,7 +37,7 @@ class NKJPCorpusReader(XMLCorpusReader):
     HEADER_MODE = 2
     RAW_MODE = 3
 
-    def __init__(self, root, fileids=".*"):
+    def __init__(self, root, fileids='.*'):
         """
         Corpus reader designed to work with National Corpus of Polish.
         See http://nkjp.pl/ for more details about NKJP.
@@ -53,11 +55,11 @@ class NKJPCorpusReader(XMLCorpusReader):
         x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
         x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
         """
-        if isinstance(fileids, str):
-            XMLCorpusReader.__init__(self, root, fileids + ".*/header.xml")
+        if isinstance(fileids, string_types):
+            XMLCorpusReader.__init__(self, root, fileids + '.*/header.xml')
         else:
             XMLCorpusReader.__init__(
-                self, root, [fileid + "/header.xml" for fileid in fileids]
+                self, root, [fileid + '/header.xml' for fileid in fileids]
             )
         self._paths = self.get_paths()
 
@@ -78,7 +80,7 @@ class NKJPCorpusReader(XMLCorpusReader):
         """
         Returns a view specialised for use with particular corpus file.
         """
-        mode = kwargs.pop("mode", NKJPCorpusReader.WORDS_MODE)
+        mode = kwargs.pop('mode', NKJPCorpusReader.WORDS_MODE)
         if mode is NKJPCorpusReader.WORDS_MODE:
             return NKJPCorpus_Morph_View(filename, tags=tags)
         elif mode is NKJPCorpusReader.SENTS_MODE:
@@ -91,7 +93,7 @@ class NKJPCorpusReader(XMLCorpusReader):
             )
 
         else:
-            raise NameError("No such mode!")
+            raise NameError('No such mode!')
 
     def add_root(self, fileid):
         """
@@ -150,7 +152,7 @@ class NKJPCorpusReader(XMLCorpusReader):
         Call with specified tags as a list, e.g. tags=['subst', 'comp'].
         Returns tagged words in specified fileids.
         """
-        tags = kwargs.pop("tags", [])
+        tags = kwargs.pop('tags', [])
         return concat(
             [
                 self._view(
@@ -186,7 +188,7 @@ class NKJPCorpus_Header_View(XMLCorpusView):
         header.xml files in NKJP corpus.
         """
         self.tagspec = ".*/sourceDesc$"
-        XMLCorpusView.__init__(self, filename + "header.xml", self.tagspec)
+        XMLCorpusView.__init__(self, filename + 'header.xml', self.tagspec)
 
     def handle_query(self):
         self._open()
@@ -200,43 +202,43 @@ class NKJPCorpus_Header_View(XMLCorpusView):
         return header
 
     def handle_elt(self, elt, context):
-        titles = elt.findall("bibl/title")
+        titles = elt.findall('bibl/title')
         title = []
         if titles:
-            title = "\n".join(title.text.strip() for title in titles)
+            title = '\n'.join(title.text.strip() for title in titles)
 
-        authors = elt.findall("bibl/author")
+        authors = elt.findall('bibl/author')
         author = []
         if authors:
-            author = "\n".join(author.text.strip() for author in authors)
+            author = '\n'.join(author.text.strip() for author in authors)
 
-        dates = elt.findall("bibl/date")
+        dates = elt.findall('bibl/date')
         date = []
         if dates:
-            date = "\n".join(date.text.strip() for date in dates)
+            date = '\n'.join(date.text.strip() for date in dates)
 
-        publishers = elt.findall("bibl/publisher")
+        publishers = elt.findall('bibl/publisher')
         publisher = []
         if publishers:
-            publisher = "\n".join(publisher.text.strip() for publisher in publishers)
+            publisher = '\n'.join(publisher.text.strip() for publisher in publishers)
 
-        idnos = elt.findall("bibl/idno")
+        idnos = elt.findall('bibl/idno')
         idno = []
         if idnos:
-            idno = "\n".join(idno.text.strip() for idno in idnos)
+            idno = '\n'.join(idno.text.strip() for idno in idnos)
 
-        notes = elt.findall("bibl/note")
+        notes = elt.findall('bibl/note')
         note = []
         if notes:
-            note = "\n".join(note.text.strip() for note in notes)
+            note = '\n'.join(note.text.strip() for note in notes)
 
         return {
-            "title": title,
-            "author": author,
-            "date": date,
-            "publisher": publisher,
-            "idno": idno,
-            "note": note,
+            'title': title,
+            'author': author,
+            'date': date,
+            'publisher': publisher,
+            'idno': idno,
+            'note': note,
         }
 
 
@@ -253,21 +255,21 @@ class XML_Tool:
 
     def build_preprocessed_file(self):
         try:
-            fr = open(self.read_file, "r")
+            fr = open(self.read_file, 'r')
             fw = self.write_file
-            line = " "
+            line = ' '
             while len(line):
                 line = fr.readline()
-                x = re.split(r"nkjp:[^ ]* ", line)  # in all files
-                ret = " ".join(x)
-                x = re.split("<nkjp:paren>", ret)  # in ann_segmentation.xml
-                ret = " ".join(x)
-                x = re.split("</nkjp:paren>", ret)  # in ann_segmentation.xml
-                ret = " ".join(x)
-                x = re.split("<choice>", ret)  # in ann_segmentation.xml
-                ret = " ".join(x)
-                x = re.split("</choice>", ret)  # in ann_segmentation.xml
-                ret = " ".join(x)
+                x = re.split(r'nkjp:[^ ]* ', line)  # in all files
+                ret = ' '.join(x)
+                x = re.split('<nkjp:paren>', ret)  # in ann_segmentation.xml
+                ret = ' '.join(x)
+                x = re.split('</nkjp:paren>', ret)  # in ann_segmentation.xml
+                ret = ' '.join(x)
+                x = re.split('<choice>', ret)  # in ann_segmentation.xml
+                ret = ' '.join(x)
+                x = re.split('</choice>', ret)  # in ann_segmentation.xml
+                ret = ' '.join(x)
                 fw.write(ret)
             fr.close()
             fw.close()
@@ -287,29 +289,29 @@ class NKJPCorpus_Segmentation_View(XMLCorpusView):
     """
 
     def __init__(self, filename, **kwargs):
-        self.tagspec = ".*p/.*s"
+        self.tagspec = '.*p/.*s'
         # intersperse NKJPCorpus_Text_View
         self.text_view = NKJPCorpus_Text_View(
             filename, mode=NKJPCorpus_Text_View.SENTS_MODE
         )
         self.text_view.handle_query()
         # xml preprocessing
-        self.xml_tool = XML_Tool(filename, "ann_segmentation.xml")
+        self.xml_tool = XML_Tool(filename, 'ann_segmentation.xml')
         # base class init
         XMLCorpusView.__init__(
             self, self.xml_tool.build_preprocessed_file(), self.tagspec
         )
 
     def get_segm_id(self, example_word):
-        return example_word.split("(")[1].split(",")[0]
+        return example_word.split('(')[1].split(',')[0]
 
     def get_sent_beg(self, beg_word):
         # returns index of beginning letter in sentence
-        return int(beg_word.split(",")[1])
+        return int(beg_word.split(',')[1])
 
     def get_sent_end(self, end_word):
         # returns index of end letter in sentence
-        splitted = end_word.split(")")[0].split(",")
+        splitted = end_word.split(')')[0].split(',')
         return int(splitted[1]) + int(splitted[2])
 
     def get_sentences(self, sent_segm):
@@ -355,7 +357,7 @@ class NKJPCorpus_Segmentation_View(XMLCorpusView):
     def handle_elt(self, elt, context):
         ret = []
         for seg in elt:
-            ret.append(seg.get("corresp"))
+            ret.append(seg.get('corresp'))
         return ret
 
 
@@ -369,11 +371,11 @@ class NKJPCorpus_Text_View(XMLCorpusView):
     RAW_MODE = 1
 
     def __init__(self, filename, **kwargs):
-        self.mode = kwargs.pop("mode", 0)
-        self.tagspec = ".*/div/ab"
+        self.mode = kwargs.pop('mode', 0)
+        self.tagspec = '.*/div/ab'
         self.segm_dict = dict()
         # xml preprocessing
-        self.xml_tool = XML_Tool(filename, "text.xml")
+        self.xml_tool = XML_Tool(filename, 'text.xml')
         # base class init
         XMLCorpusView.__init__(
             self, self.xml_tool.build_preprocessed_file(), self.tagspec
@@ -402,11 +404,11 @@ class NKJPCorpus_Text_View(XMLCorpusView):
             for part in segm:
                 txt.append(part)
 
-        return [" ".join([segm for segm in txt])]
+        return [' '.join([segm for segm in txt])]
 
     def get_segm_id(self, elt):
         for attr in elt.attrib:
-            if attr.endswith("id"):
+            if attr.endswith('id'):
                 return elt.get(attr)
 
     def handle_elt(self, elt, context):
@@ -423,9 +425,9 @@ class NKJPCorpus_Morph_View(XMLCorpusView):
     """
 
     def __init__(self, filename, **kwargs):
-        self.tags = kwargs.pop("tags", None)
-        self.tagspec = ".*/seg/fs"
-        self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml")
+        self.tags = kwargs.pop('tags', None)
+        self.tagspec = '.*/seg/fs'
+        self.xml_tool = XML_Tool(filename, 'ann_morphosyntax.xml')
         XMLCorpusView.__init__(
             self, self.xml_tool.build_preprocessed_file(), self.tagspec
         )
@@ -449,7 +451,7 @@ class NKJPCorpus_Morph_View(XMLCorpusView):
             raise Exception
 
     def handle_elt(self, elt, context):
-        word = ""
+        word = ''
         flag = False
         is_not_interp = True
         # if tags not specified, then always return word
@@ -459,28 +461,28 @@ class NKJPCorpus_Morph_View(XMLCorpusView):
         for child in elt:
 
             # get word
-            if "name" in child.keys() and child.attrib["name"] == "orth":
+            if 'name' in child.keys() and child.attrib['name'] == 'orth':
                 for symbol in child:
-                    if symbol.tag == "string":
+                    if symbol.tag == 'string':
                         word = symbol.text
-            elif "name" in child.keys() and child.attrib["name"] == "interps":
+            elif 'name' in child.keys() and child.attrib['name'] == 'interps':
                 for symbol in child:
-                    if "type" in symbol.keys() and symbol.attrib["type"] == "lex":
+                    if 'type' in symbol.keys() and symbol.attrib['type'] == 'lex':
                         for symbol2 in symbol:
                             if (
-                                "name" in symbol2.keys()
-                                and symbol2.attrib["name"] == "ctag"
+                                'name' in symbol2.keys()
+                                and symbol2.attrib['name'] == 'ctag'
                             ):
                                 for symbol3 in symbol2:
                                     if (
-                                        "value" in symbol3.keys()
+                                        'value' in symbol3.keys()
                                         and self.tags is not None
-                                        and symbol3.attrib["value"] in self.tags
+                                        and symbol3.attrib['value'] in self.tags
                                     ):
                                         flag = True
                                     elif (
-                                        "value" in symbol3.keys()
-                                        and symbol3.attrib["value"] == "interp"
+                                        'value' in symbol3.keys()
+                                        and symbol3.attrib['value'] == 'interp'
                                     ):
                                         is_not_interp = False
         if flag and is_not_interp:
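Each chained `re.split` / `' '.join` pass in `XML_Tool.build_preprocessed_file` above amounts to replacing every matched fragment with a single space before the XML is handed to `XMLCorpusView`. A compact per-line sketch of the same cleanup (illustration only, not the reader's actual code):

    import re

    def strip_nkjp_markup(line):
        # Mirror the split/join passes: each matched fragment collapses to one space.
        for pattern in (r'nkjp:[^ ]* ', '<nkjp:paren>', '</nkjp:paren>',
                        '<choice>', '</choice>'):
            line = ' '.join(re.split(pattern, line))
        return line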
index 06740d0..603646a 100644 (file)
@@ -1,16 +1,20 @@
 # Natural Language Toolkit: NomBank Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors: Paul Bedaride <paul.bedaride@gmail.com>
 #          Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import unicode_literals
 from xml.etree import ElementTree
 from functools import total_ordering
 
+from six import string_types
+
 from nltk.tree import Tree
 from nltk.internals import raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible
 
 from nltk.corpus.reader.util import *
 from nltk.corpus.reader.api import *
@@ -34,11 +38,11 @@ class NombankCorpusReader(CorpusReader):
         self,
         root,
         nomfile,
-        framefiles="",
+        framefiles='',
         nounsfile=None,
         parse_fileid_xform=None,
         parse_corpus=None,
-        encoding="utf8",
+        encoding='utf8',
     ):
         """
         :param root: The root directory for this corpus.
@@ -54,16 +58,16 @@ class NombankCorpusReader(CorpusReader):
             corresponding to this corpus.  These parse trees are
             necessary to resolve the tree pointers used by nombank.
         """
-
         # If framefiles is specified as a regexp, expand it.
-        if isinstance(framefiles, str):
-            self._fileids = find_corpus_fileids(root, framefiles)
-        self._fileids = list(framefiles)
+        if isinstance(framefiles, string_types):
+            framefiles = find_corpus_fileids(root, framefiles)
+        framefiles = list(framefiles)
         # Initialze the corpus reader.
-        CorpusReader.__init__(self, root, framefiles, encoding)
+        CorpusReader.__init__(self, root, [nomfile, nounsfile] + framefiles, encoding)
 
-        # Record our nom file & nouns file.
+        # Record our frame fileids & nom file.
         self._nomfile = nomfile
+        self._framefiles = framefiles
         self._nounsfile = nounsfile
         self._parse_fileid_xform = parse_fileid_xform
         self._parse_corpus = parse_corpus
@@ -74,7 +78,7 @@ class NombankCorpusReader(CorpusReader):
         """
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
@@ -85,7 +89,7 @@ class NombankCorpusReader(CorpusReader):
         """
         kwargs = {}
         if baseform is not None:
-            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
+            kwargs['instance_filter'] = lambda inst: inst.baseform == baseform
         return StreamBackedCorpusView(
             self.abspath(self._nomfile),
             lambda stream: self._read_instance_block(stream, **kwargs),
@@ -107,41 +111,41 @@ class NombankCorpusReader(CorpusReader):
         """
         :return: the xml description for the given roleset.
         """
-        baseform = roleset_id.split(".")[0]
-        baseform = baseform.replace("perc-sign", "%")
-        baseform = baseform.replace("oneslashonezero", "1/10").replace(
-            "1/10", "1-slash-10"
+        baseform = roleset_id.split('.')[0]
+        baseform = baseform.replace('perc-sign', '%')
+        baseform = baseform.replace('oneslashonezero', '1/10').replace(
+            '1/10', '1-slash-10'
         )
-        framefile = "frames/%s.xml" % baseform
-        if framefile not in self.fileids():
-            raise ValueError("Frameset file for %s not found" % roleset_id)
+        framefile = 'frames/%s.xml' % baseform
+        if framefile not in self._framefiles:
+            raise ValueError('Frameset file for %s not found' % roleset_id)
 
         # n.b.: The encoding for XML fileids is specified by the file
         # itself; so we ignore self._encoding here.
         etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
-        for roleset in etree.findall("predicate/roleset"):
-            if roleset.attrib["id"] == roleset_id:
+        for roleset in etree.findall('predicate/roleset'):
+            if roleset.attrib['id'] == roleset_id:
                 return roleset
-        raise ValueError("Roleset %s not found in %s" % (roleset_id, framefile))
+        raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
 
     def rolesets(self, baseform=None):
         """
         :return: list of xml descriptions for rolesets.
         """
         if baseform is not None:
-            framefile = "frames/%s.xml" % baseform
-            if framefile not in self.fileids():
-                raise ValueError("Frameset file for %s not found" % baseform)
+            framefile = 'frames/%s.xml' % baseform
+            if framefile not in self._framefiles:
+                raise ValueError('Frameset file for %s not found' % baseform)
             framefiles = [framefile]
         else:
-            framefiles = self.fileids()
+            framefiles = self._framefiles
 
         rsets = []
         for framefile in framefiles:
             # n.b.: The encoding for XML fileids is specified by the file
             # itself; so we ignore self._encoding here.
             etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
-            rsets.append(etree.findall("predicate/roleset"))
+            rsets.append(etree.findall('predicate/roleset'))
         return LazyConcatenation(rsets)
 
     def nouns(self):
@@ -176,6 +180,7 @@ class NombankCorpusReader(CorpusReader):
 ######################################################################
 
 
+@python_2_unicode_compatible
 class NombankInstance(object):
     def __init__(
         self,
@@ -232,28 +237,28 @@ class NombankInstance(object):
         """The name of the roleset used by this instance's predicate.
         Use ``nombank.roleset() <NombankCorpusReader.roleset>`` to
         look up information about the roleset."""
-        r = self.baseform.replace("%", "perc-sign")
-        r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero")
-        return "%s.%s" % (r, self.sensenumber)
+        r = self.baseform.replace('%', 'perc-sign')
+        r = r.replace('1/10', '1-slash-10').replace('1-slash-10', 'oneslashonezero')
+        return '%s.%s' % (r, self.sensenumber)
 
     def __repr__(self):
-        return "<NombankInstance: %s, sent %s, word %s>" % (
+        return '<NombankInstance: %s, sent %s, word %s>' % (
             self.fileid,
             self.sentnum,
             self.wordnum,
         )
 
     def __str__(self):
-        s = "%s %s %s %s %s" % (
+        s = '%s %s %s %s %s' % (
             self.fileid,
             self.sentnum,
             self.wordnum,
             self.baseform,
             self.sensenumber,
         )
-        items = self.arguments + ((self.predicate, "rel"),)
+        items = self.arguments + ((self.predicate, 'rel'),)
         for (argloc, argid) in sorted(items):
-            s += " %s-%s" % (argloc, argid)
+            s += ' %s-%s' % (argloc, argid)
         return s
 
     def _get_tree(self):
@@ -274,15 +279,15 @@ class NombankInstance(object):
     def parse(s, parse_fileid_xform=None, parse_corpus=None):
         pieces = s.split()
         if len(pieces) < 6:
-            raise ValueError("Badly formatted nombank line: %r" % s)
+            raise ValueError('Badly formatted nombank line: %r' % s)
 
         # Divide the line into its basic pieces.
         (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]
 
         args = pieces[5:]
-        rel = [args.pop(i) for i, p in enumerate(args) if "-rel" in p]
+        rel = [args.pop(i) for i, p in enumerate(args) if '-rel' in p]
         if len(rel) != 1:
-            raise ValueError("Badly formatted nombank line: %r" % s)
+            raise ValueError('Badly formatted nombank line: %r' % s)
 
         # Apply the fileid selector, if any.
         if parse_fileid_xform is not None:
@@ -294,13 +299,13 @@ class NombankInstance(object):
 
         # Parse the predicate location.
 
-        predloc, predid = rel[0].split("-", 1)
+        predloc, predid = rel[0].split('-', 1)
         predicate = NombankTreePointer.parse(predloc)
 
         # Parse the arguments.
         arguments = []
         for arg in args:
-            argloc, argid = arg.split("-", 1)
+            argloc, argid = arg.split('-', 1)
             arguments.append((NombankTreePointer.parse(argloc), argid))
 
         # Put it all together.
@@ -337,6 +342,7 @@ class NombankPointer(object):
             raise NotImplementedError()
 
 
+@python_2_unicode_compatible
 class NombankChainTreePointer(NombankPointer):
     def __init__(self, pieces):
         self.pieces = pieces
@@ -345,17 +351,18 @@ class NombankChainTreePointer(NombankPointer):
            ``NombankTreePointer`` pointers."""
 
     def __str__(self):
-        return "*".join("%s" % p for p in self.pieces)
+        return '*'.join('%s' % p for p in self.pieces)
 
     def __repr__(self):
-        return "<NombankChainTreePointer: %s>" % self
+        return '<NombankChainTreePointer: %s>' % self
 
     def select(self, tree):
         if tree is None:
-            raise ValueError("Parse tree not avaialable")
-        return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])
+            raise ValueError('Parse tree not avaialable')
+        return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
 
 
+@python_2_unicode_compatible
 class NombankSplitTreePointer(NombankPointer):
     def __init__(self, pieces):
         self.pieces = pieces
@@ -363,18 +370,19 @@ class NombankSplitTreePointer(NombankPointer):
            all ``NombankTreePointer`` pointers."""
 
     def __str__(self):
-        return ",".join("%s" % p for p in self.pieces)
+        return ','.join('%s' % p for p in self.pieces)
 
     def __repr__(self):
-        return "<NombankSplitTreePointer: %s>" % self
+        return '<NombankSplitTreePointer: %s>' % self
 
     def select(self, tree):
         if tree is None:
-            raise ValueError("Parse tree not avaialable")
-        return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])
+            raise ValueError('Parse tree not avaialable')
+        return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])
 
 
 @total_ordering
+@python_2_unicode_compatible
 class NombankTreePointer(NombankPointer):
     """
     wordnum:height*wordnum:height*...
@@ -389,30 +397,30 @@ class NombankTreePointer(NombankPointer):
     @staticmethod
     def parse(s):
         # Deal with chains (xx*yy*zz)
-        pieces = s.split("*")
+        pieces = s.split('*')
         if len(pieces) > 1:
             return NombankChainTreePointer(
                 [NombankTreePointer.parse(elt) for elt in pieces]
             )
 
         # Deal with split args (xx,yy,zz)
-        pieces = s.split(",")
+        pieces = s.split(',')
         if len(pieces) > 1:
             return NombankSplitTreePointer(
                 [NombankTreePointer.parse(elt) for elt in pieces]
             )
 
         # Deal with normal pointers.
-        pieces = s.split(":")
+        pieces = s.split(':')
         if len(pieces) != 2:
-            raise ValueError("bad nombank pointer %r" % s)
+            raise ValueError('bad nombank pointer %r' % s)
         return NombankTreePointer(int(pieces[0]), int(pieces[1]))
 
     def __str__(self):
-        return "%s:%s" % (self.wordnum, self.height)
+        return '%s:%s' % (self.wordnum, self.height)
 
     def __repr__(self):
-        return "NombankTreePointer(%d, %d)" % (self.wordnum, self.height)
+        return 'NombankTreePointer(%d, %d)' % (self.wordnum, self.height)
 
     def __eq__(self, other):
         while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
@@ -437,7 +445,7 @@ class NombankTreePointer(NombankPointer):
 
     def select(self, tree):
         if tree is None:
-            raise ValueError("Parse tree not avaialable")
+            raise ValueError('Parse tree not avaialable')
         return tree[self.treepos(tree)]
 
     def treepos(self, tree):
@@ -446,12 +454,14 @@ class NombankTreePointer(NombankPointer):
         given that it points to the given tree.
         """
         if tree is None:
-            raise ValueError("Parse tree not avaialable")
+            raise ValueError('Parse tree not avaialable')
         stack = [tree]
         treepos = []
 
         wordnum = 0
         while True:
+            # print treepos
+            # print stack[-1]
             # tree node:
             if isinstance(stack[-1], Tree):
                 # Select the next child.
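The `@python_2_unicode_compatible` decorators reinstated above (imported from `nltk.compat`, which mirrors `six`) only change behaviour on Python 2: the decorated class defines `__str__` returning text, and the decorator re-exposes that method as `__unicode__` while installing a byte-encoding `__str__`; on Python 3 it leaves the class untouched. A rough sketch of that behaviour, assuming the six-style implementation:

    import sys

    def python_2_unicode_compatible(cls):
        # Sketch only: no-op on Python 3; on Python 2, route str() through
        # __unicode__ and encode the result as UTF-8 bytes.
        if sys.version_info[0] == 2:
            cls.__unicode__ = cls.__str__
            cls.__str__ = lambda self: self.__unicode__().encode('utf-8')
        return cls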
index 8dfd8a5..391f61d 100644 (file)
@@ -1,9 +1,10 @@
 # Natural Language Toolkit: NPS Chat Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import unicode_literals
 
 import re
 import textwrap
@@ -26,14 +27,14 @@ class NPSChatCorpusReader(XMLCorpusReader):
         if self._wrap_etree:
             return concat(
                 [
-                    XMLCorpusView(fileid, "Session/Posts/Post", self._wrap_elt)
+                    XMLCorpusView(fileid, 'Session/Posts/Post', self._wrap_elt)
                     for fileid in self.abspaths(fileids)
                 ]
             )
         else:
             return concat(
                 [
-                    XMLCorpusView(fileid, "Session/Posts/Post")
+                    XMLCorpusView(fileid, 'Session/Posts/Post')
                     for fileid in self.abspaths(fileids)
                 ]
             )
@@ -42,7 +43,7 @@ class NPSChatCorpusReader(XMLCorpusReader):
         return concat(
             [
                 XMLCorpusView(
-                    fileid, "Session/Posts/Post/terminals", self._elt_to_words
+                    fileid, 'Session/Posts/Post/terminals', self._elt_to_words
                 )
                 for fileid in self.abspaths(fileids)
             ]
@@ -54,7 +55,7 @@ class NPSChatCorpusReader(XMLCorpusReader):
 
         return concat(
             [
-                XMLCorpusView(fileid, "Session/Posts/Post/terminals", reader)
+                XMLCorpusView(fileid, 'Session/Posts/Post/terminals', reader)
                 for fileid in self.abspaths(fileids)
             ]
         )
@@ -69,12 +70,12 @@ class NPSChatCorpusReader(XMLCorpusReader):
         return ElementWrapper(elt)
 
     def _elt_to_words(self, elt, handler):
-        return [self._simplify_username(t.attrib["word"]) for t in elt.findall("t")]
+        return [self._simplify_username(t.attrib['word']) for t in elt.findall('t')]
 
     def _elt_to_tagged_words(self, elt, handler, tagset=None):
         tagged_post = [
-            (self._simplify_username(t.attrib["word"]), t.attrib["pos"])
-            for t in elt.findall("t")
+            (self._simplify_username(t.attrib['word']), t.attrib['pos'])
+            for t in elt.findall('t')
         ]
         if tagset and tagset != self._tagset:
             tagged_post = [
@@ -84,8 +85,8 @@ class NPSChatCorpusReader(XMLCorpusReader):
 
     @staticmethod
     def _simplify_username(word):
-        if "User" in word:
-            word = "U" + word.split("User", 1)[1]
+        if 'User' in word:
+            word = 'U' + word.split('User', 1)[1]
         elif isinstance(word, bytes):
-            word = word.decode("ascii")
+            word = word.decode('ascii')
         return word
index 598db32..cfe7f6e 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Opinion Lexicon Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Pierpaolo Pantone <24alsecondo@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -27,6 +27,7 @@ Related papers:
     Comparing Opinions on the Web". Proceedings of the 14th International World
     Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
 """
+from six import string_types
 
 from nltk.corpus.reader import WordListCorpusReader
 from nltk.corpus.reader.api import *
@@ -85,7 +86,7 @@ class OpinionLexiconCorpusReader(WordListCorpusReader):
         """
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat(
             [
@@ -101,7 +102,7 @@ class OpinionLexiconCorpusReader(WordListCorpusReader):
         :return: a list of positive words.
         :rtype: list(str)
         """
-        return self.words("positive-words.txt")
+        return self.words('positive-words.txt')
 
     def negative(self):
         """
@@ -110,7 +111,7 @@ class OpinionLexiconCorpusReader(WordListCorpusReader):
         :return: a list of negative words.
         :rtype: list(str)
         """
-        return self.words("negative-words.txt")
+        return self.words('negative-words.txt')
 
     def _read_word_block(self, stream):
         words = []
index ab71dc7..44bfb96 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: PanLex Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: David Kamholz <kamholz@panlex.org>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -43,12 +43,12 @@ class PanLexLiteCorpusReader(CorpusReader):
     """
 
     def __init__(self, root):
-        self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor()
+        self._c = sqlite3.connect(os.path.join(root, 'db.sqlite')).cursor()
 
         self._uid_lv = {}
         self._lv_uid = {}
 
-        for row in self._c.execute("SELECT uid, lv FROM lv"):
+        for row in self._c.execute('SELECT uid, lv FROM lv'):
             self._uid_lv[row[0]] = row[1]
             self._lv_uid[row[1]] = row[0]
 
@@ -65,10 +65,10 @@ class PanLexLiteCorpusReader(CorpusReader):
         """
 
         if lc is None:
-            return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall()
+            return self._c.execute('SELECT uid, tt FROM lv ORDER BY uid').fetchall()
         else:
             return self._c.execute(
-                "SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,)
+                'SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid', (lc,)
             ).fetchall()
 
     def meanings(self, expr_uid, expr_tt):
@@ -92,16 +92,16 @@ class PanLexLiteCorpusReader(CorpusReader):
 
             if not mn in mn_info:
                 mn_info[mn] = {
-                    "uq": i[1],
-                    "ap": i[2],
-                    "ui": i[3],
-                    "ex": {expr_uid: [expr_tt]},
+                    'uq': i[1],
+                    'ap': i[2],
+                    'ui': i[3],
+                    'ex': {expr_uid: [expr_tt]},
                 }
 
-            if not uid in mn_info[mn]["ex"]:
-                mn_info[mn]["ex"][uid] = []
+            if not uid in mn_info[mn]['ex']:
+                mn_info[mn]['ex'][uid] = []
 
-            mn_info[mn]["ex"][uid].append(i[4])
+            mn_info[mn]['ex'][uid].append(i[4])
 
         return [Meaning(mn, mn_info[mn]) for mn in mn_info]
 
@@ -134,35 +134,35 @@ class Meaning(dict):
 
     def __init__(self, mn, attr):
         super(Meaning, self).__init__(**attr)
-        self["mn"] = mn
+        self['mn'] = mn
 
     def id(self):
         """
         :return: the meaning's id.
         :rtype: int
         """
-        return self["mn"]
+        return self['mn']
 
     def quality(self):
         """
         :return: the meaning's source's quality (0=worst, 9=best).
         :rtype: int
         """
-        return self["uq"]
+        return self['uq']
 
     def source(self):
         """
         :return: the meaning's source id.
         :rtype: int
         """
-        return self["ap"]
+        return self['ap']
 
     def source_group(self):
         """
         :return: the meaning's source group id.
         :rtype: int
         """
-        return self["ui"]
+        return self['ui']
 
     def expressions(self):
         """
@@ -171,4 +171,4 @@ class Meaning(dict):
             texts.
         :rtype: dict
         """
-        return self["ex"]
+        return self['ex']
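The `Meaning` accessors above are thin wrappers over the dictionary built by `meanings()`. A hedged usage sketch (it assumes the `panlex_lite` data package has been downloaded and uses the `nltk.corpus.panlex_lite` loader as the entry point; 'eng-000' is the PanLex UID for English):

    from nltk.corpus import panlex_lite

    # Translations of the English expression "book", grouped by meaning.
    for meaning in panlex_lite.meanings('eng-000', 'book'):
        print(meaning.id(), meaning.quality(), meaning.expressions())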
diff --git a/nlp_resource_data/nltk/corpus/reader/panlex_swadesh.py b/nlp_resource_data/nltk/corpus/reader/panlex_swadesh.py
deleted file mode 100644 (file)
index ed46a4b..0000000
+++ /dev/null
@@ -1,91 +0,0 @@
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: Word List Corpus Reader
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Steven Bird <stevenbird1@gmail.com>
-#         Edward Loper <edloper@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-
-from collections import namedtuple, defaultdict
-import re
-
-from nltk.tokenize import line_tokenize
-
-from nltk.corpus.reader.wordlist import WordListCorpusReader
-from nltk.corpus.reader.util import *
-from nltk.corpus.reader.api import *
-
-PanlexLanguage = namedtuple('PanlexLanguage',
-                          ['panlex_uid',  # (1) PanLex UID
-                           'iso639',      # (2) ISO 639 language code
-                           'iso639_type', # (3) ISO 639 language type, see README
-                           'script',      # (4) normal scripts of expressions
-                           'name',        # (5) PanLex default name
-                           'langvar_uid'  # (6) UID of the language variety in which the default name is an expression
-                           ])
-
-class PanlexSwadeshCorpusReader(WordListCorpusReader):
-    """
-    This is a class to read the PanLex Swadesh list from
-
-    David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
-    PanLex: Building a Resource for Panlingual Lexical Translation.
-    In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf
-
-    License: CC0 1.0 Universal
-    https://creativecommons.org/publicdomain/zero/1.0/legalcode
-    """
-    def __init__(self, *args, **kwargs):
-        super(PanlexSwadeshCorpusReader, self).__init__(*args, **kwargs)
-        # Find the swadesh size using the fileids' path.
-        self.swadesh_size = re.match(r'swadesh([0-9].*)\/', self.fileids()[0]).group(1)
-        self._languages = {lang.panlex_uid:lang for lang in self.get_languages()}
-        self._macro_langauges = self.get_macrolanguages()
-
-    def license(self):
-        print('CC0 1.0 Universal')
-
-    def readme(self):
-        print(self.raw('README'))
-
-    def language_codes(self):
-        return self._languages.keys()
-
-    def get_languages(self):
-        for line in self.raw('langs{}.txt'.format(self.swadesh_size)).split('\n'):
-            if not line.strip(): # Skip empty lines.
-                continue
-            yield PanlexLanguage(*line.strip().split('\t'))
-
-    def get_macrolanguages(self):
-        macro_langauges = defaultdict(list)
-        for lang in self._languages.values():
-            macro_langauges[lang.iso639].append(lang.panlex_uid)
-        return macro_langauges
-
-    def words_by_lang(self, lang_code):
-        """
-        :return: a list of list(str)
-        """
-        fileid = 'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
-        return [concept.split('\t') for concept in self.words(fileid)]
-
-    def words_by_iso639(self, iso63_code):
-        """
-        :return: a list of list(str)
-        """
-        fileids = ['swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
-                   for lang_code in self._macro_langauges[iso63_code]]
-        return [concept.split('\t') for fileid in fileids for concept in self.words(fileid)]
-
-    def entries(self, fileids=None):
-        """
-        :return: a tuple of words for the specified fileids.
-        """
-        if not fileids:
-            fileids = self.fileids()
-
-        wordlists = [self.words(f) for f in fileids]
-        return list(zip(*wordlists))
index aaf280d..a8a1f6f 100644 (file)
@@ -1,19 +1,21 @@
 # Natural Language Toolkit:
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from six import string_types
+
 from nltk.corpus.reader.api import *
 from nltk.corpus.reader.xmldocs import XMLCorpusReader
 
 
-PARA = re.compile(r"<p(?: [^>]*){0,1}>(.*?)</p>")
-SENT = re.compile(r"<s(?: [^>]*){0,1}>(.*?)</s>")
+PARA = re.compile(r'<p(?: [^>]*){0,1}>(.*?)</p>')
+SENT = re.compile(r'<s(?: [^>]*){0,1}>(.*?)</s>')
 
-TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>")
-WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)</[wc]>")
+TAGGEDWORD = re.compile(r'<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>')
+WORD = re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)</[wc]>')
 
 TYPE = re.compile(r'type="(.*?)"')
 ANA = re.compile(r'ana="(.*?)"')
@@ -46,22 +48,22 @@ class TEICorpusView(StreamBackedCorpusView):
     def read_block(self, stream):
         block = stream.readlines(self._pagesize)
         block = concat(block)
-        while (block.count("<text id") > block.count("</text>")) or block.count(
-            "<text id"
+        while (block.count('<text id') > block.count('</text>')) or block.count(
+            '<text id'
         ) == 0:
             tmp = stream.readline()
             if len(tmp) <= 0:
                 break
             block += tmp
 
-        block = block.replace("\n", "")
+        block = block.replace('\n', '')
 
         textids = TEXTID.findall(block)
         if self._textids:
             for tid in textids:
                 if tid not in self._textids:
                     beg = block.find(tid) - 1
-                    end = block[beg:].find("</text>") + len("</text>")
+                    end = block[beg:].find('</text>') + len('</text>')
                     block = block[:beg] + block[beg + end :]
 
         output = []
@@ -84,7 +86,7 @@ class TEICorpusView(StreamBackedCorpusView):
 
     def _parse_tag(self, tag_word_tuple):
         (tag, word) = tag_word_tuple
-        if tag.startswith("w"):
+        if tag.startswith('w'):
             tag = ANA.search(tag).group(1)
         else:  # tag.startswith('c')
             tag = TYPE.search(tag).group(1)
@@ -95,8 +97,8 @@ class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
     head_len = 2770
 
     def __init__(self, *args, **kwargs):
-        if "textid_file" in kwargs:
-            self._textids = kwargs["textid_file"]
+        if 'textid_file' in kwargs:
+            self._textids = kwargs['textid_file']
         else:
             self._textids = None
 
@@ -112,10 +114,10 @@ class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
             with open(self._textids) as fp:
                 for line in fp:
                     line = line.strip()
-                    file_id, text_ids = line.split(" ", 1)
+                    file_id, text_ids = line.split(' ', 1)
                     if file_id not in self.fileids():
                         raise ValueError(
-                            "In text_id mapping file %s: %s not found"
+                            'In text_id mapping file %s: %s not found'
                             % (self._textids, file_id)
                         )
                     for text_id in text_ids.split(self._delimiter):
@@ -128,16 +130,16 @@ class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
     def _resolve(self, fileids, categories, textids=None):
         tmp = None
         if (
-            len(list(
+            len(
                 filter(
                     lambda accessor: accessor is None, (fileids, categories, textids)
                 )
-            ))
+            )
             != 1
         ):
 
             raise ValueError(
-                "Specify exactly one of: fileids, " "categories or textids"
+                'Specify exactly one of: fileids, ' 'categories or textids'
             )
 
         if fileids is not None:
@@ -147,7 +149,7 @@ class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
             return self.fileids(categories), None
 
         if textids is not None:
-            if isinstance(textids, str):
+            if isinstance(textids, string_types):
                 textids = [textids]
             files = sum((self._t2f[t] for t in textids), [])
             tdict = dict()
@@ -171,7 +173,7 @@ class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
         if fileids is None:
             return sorted(self._t2f)
 
-        if isinstance(fileids, str):
+        if isinstance(fileids, string_types):
             fileids = [fileids]
         return sorted(sum((self._f2t[d] for d in fileids), []))
 
@@ -179,7 +181,7 @@ class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
         fileids, textids = self._resolve(fileids, categories, textids)
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
 
         if textids:
@@ -214,7 +216,7 @@ class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
         fileids, textids = self._resolve(fileids, categories, textids)
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
 
         if textids:
@@ -245,7 +247,7 @@ class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
         fileids, textids = self._resolve(fileids, categories, textids)
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
 
         if textids:
@@ -276,7 +278,7 @@ class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
         fileids, textids = self._resolve(fileids, categories, textids)
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
 
         if textids:
@@ -307,7 +309,7 @@ class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
         fileids, textids = self._resolve(fileids, categories, textids)
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
 
         if textids:
@@ -338,7 +340,7 @@ class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
         fileids, textids = self._resolve(fileids, categories, textids)
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
 
         if textids:
@@ -370,12 +372,12 @@ class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
         if len(fileids) == 1:
             return XMLCorpusReader.xml(self, fileids[0])
         else:
-            raise TypeError("Expected a single file")
+            raise TypeError('Expected a single file')
 
     def raw(self, fileids=None, categories=None):
         fileids, _ = self._resolve(fileids, categories)
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
index 17f484b..4de7787 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Plaintext Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 #         Nitin Madnani <nmadnani@umiacs.umd.edu>
@@ -40,9 +40,9 @@ class PlaintextCorpusReader(CorpusReader):
         root,
         fileids,
         word_tokenizer=WordPunctTokenizer(),
-        sent_tokenizer=nltk.data.LazyLoader("tokenizers/punkt/english.pickle"),
+        sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/english.pickle'),
         para_block_reader=read_blankline_block,
-        encoding="utf8",
+        encoding='utf8',
     ):
         """
         Construct a new plaintext corpus reader for a set of documents
@@ -72,7 +72,7 @@ class PlaintextCorpusReader(CorpusReader):
         """
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         raw_texts = []
         for f in fileids:
@@ -102,7 +102,7 @@ class PlaintextCorpusReader(CorpusReader):
         :rtype: list(list(str))
         """
         if self._sent_tokenizer is None:
-            raise ValueError("No sentence tokenizer for this corpus")
+            raise ValueError('No sentence tokenizer for this corpus')
 
         return concat(
             [
@@ -119,7 +119,7 @@ class PlaintextCorpusReader(CorpusReader):
         :rtype: list(list(list(str)))
         """
         if self._sent_tokenizer is None:
-            raise ValueError("No sentence tokenizer for this corpus")
+            raise ValueError('No sentence tokenizer for this corpus')
 
         return concat(
             [
@@ -175,7 +175,7 @@ class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusR
 
     def _resolve(self, fileids, categories):
         if fileids is not None and categories is not None:
-            raise ValueError("Specify fileids or categories, not both")
+            raise ValueError('Specify fileids or categories, not both')
         if categories is not None:
             return self.fileids(categories)
         else:
@@ -200,8 +200,8 @@ class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusR
 class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
     def __init__(self, *args, **kwargs):
         CategorizedCorpusReader.__init__(self, kwargs)
-        kwargs["sent_tokenizer"] = nltk.data.LazyLoader(
-            "tokenizers/punkt/portuguese.pickle"
+        kwargs['sent_tokenizer'] = nltk.data.LazyLoader(
+            'tokenizers/punkt/portuguese.pickle'
         )
         PlaintextCorpusReader.__init__(self, *args, **kwargs)
 
@@ -259,5 +259,5 @@ class EuroparlCorpusReader(PlaintextCorpusReader):
 
     def paras(self, fileids=None):
         raise NotImplementedError(
-            "The Europarl corpus reader does not support paragraphs. Please use chapters() instead."
+            'The Europarl corpus reader does not support paragraphs. Please use chapters() instead.'
         )
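The `PlaintextCorpusReader` constructor above defaults `sent_tokenizer` to a lazily loaded Punkt model, so sentence access only needs the punkt data to be installed. A hedged construction sketch (the corpus root and fileid pattern are hypothetical):

    from nltk.corpus.reader.plaintext import PlaintextCorpusReader

    # Read every .txt file under an illustrative corpus root.
    reader = PlaintextCorpusReader('/path/to/corpus', r'.*\.txt', encoding='utf8')
    print(reader.fileids())
    print(reader.sents()[:2])   # sentences via the default English punkt tokenizer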
index 60c2b02..3bc06e4 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: PP Attachment Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -37,11 +37,16 @@ Conference.  [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps]
 The PP Attachment Corpus is distributed with NLTK with the permission
 of the author.
 """
+from __future__ import unicode_literals
 
+from six import string_types
+
+from nltk import compat
 from nltk.corpus.reader.util import *
 from nltk.corpus.reader.api import *
 
 
+@compat.python_2_unicode_compatible
 class PPAttachment(object):
     def __init__(self, sent, verb, noun1, prep, noun2, attachment):
         self.sent = sent
@@ -53,8 +58,8 @@ class PPAttachment(object):
 
     def __repr__(self):
         return (
-            "PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, "
-            "noun2=%r, attachment=%r)"
+            'PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, '
+            'noun2=%r, attachment=%r)'
             % (self.sent, self.verb, self.noun1, self.prep, self.noun2, self.attachment)
         )
 
@@ -83,7 +88,7 @@ class PPAttachmentCorpusReader(CorpusReader):
     def raw(self, fileids=None):
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
index 7c49edc..5c9bdd9 100644 (file)
@@ -1,14 +1,17 @@
 # Natural Language Toolkit: PropBank Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import unicode_literals
 import re
 from functools import total_ordering
 from xml.etree import ElementTree
 
+from six import string_types
+
 from nltk.tree import Tree
 from nltk.internals import raise_unorderable_types
 
@@ -34,11 +37,11 @@ class PropbankCorpusReader(CorpusReader):
         self,
         root,
         propfile,
-        framefiles="",
+        framefiles='',
         verbsfile=None,
         parse_fileid_xform=None,
         parse_corpus=None,
-        encoding="utf8",
+        encoding='utf8',
     ):
         """
         :param root: The root directory for this corpus.
@@ -55,7 +58,7 @@ class PropbankCorpusReader(CorpusReader):
             necessary to resolve the tree pointers used by propbank.
         """
         # If framefiles is specified as a regexp, expand it.
-        if isinstance(framefiles, str):
+        if isinstance(framefiles, string_types):
             framefiles = find_corpus_fileids(root, framefiles)
         framefiles = list(framefiles)
         # Initialze the corpus reader.
@@ -85,7 +88,7 @@ class PropbankCorpusReader(CorpusReader):
         """
         kwargs = {}
         if baseform is not None:
-            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
+            kwargs['instance_filter'] = lambda inst: inst.baseform == baseform
         return StreamBackedCorpusView(
             self.abspath(self._propfile),
             lambda stream: self._read_instance_block(stream, **kwargs),
@@ -107,27 +110,27 @@ class PropbankCorpusReader(CorpusReader):
         """
         :return: the xml description for the given roleset.
         """
-        baseform = roleset_id.split(".")[0]
-        framefile = "frames/%s.xml" % baseform
+        baseform = roleset_id.split('.')[0]
+        framefile = 'frames/%s.xml' % baseform
         if framefile not in self._framefiles:
-            raise ValueError("Frameset file for %s not found" % roleset_id)
+            raise ValueError('Frameset file for %s not found' % roleset_id)
 
         # n.b.: The encoding for XML fileids is specified by the file
         # itself; so we ignore self._encoding here.
         etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
-        for roleset in etree.findall("predicate/roleset"):
-            if roleset.attrib["id"] == roleset_id:
+        for roleset in etree.findall('predicate/roleset'):
+            if roleset.attrib['id'] == roleset_id:
                 return roleset
-        raise ValueError("Roleset %s not found in %s" % (roleset_id, framefile))
+        raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
 
     def rolesets(self, baseform=None):
         """
         :return: list of xml descriptions for rolesets.
         """
         if baseform is not None:
-            framefile = "frames/%s.xml" % baseform
+            framefile = 'frames/%s.xml' % baseform
             if framefile not in self._framefiles:
-                raise ValueError("Frameset file for %s not found" % baseform)
+                raise ValueError('Frameset file for %s not found' % baseform)
             framefiles = [framefile]
         else:
             framefiles = self._framefiles
@@ -137,7 +140,7 @@ class PropbankCorpusReader(CorpusReader):
             # n.b.: The encoding for XML fileids is specified by the file
             # itself; so we ignore self._encoding here.
             etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
-            rsets.append(etree.findall("predicate/roleset"))
+            rsets.append(etree.findall('predicate/roleset'))
         return LazyConcatenation(rsets)
 
     def verbs(self):
@@ -172,7 +175,7 @@ class PropbankCorpusReader(CorpusReader):
 ######################################################################
 
 
-
+@compat.python_2_unicode_compatible
 class PropbankInstance(object):
     def __init__(
         self,
@@ -231,27 +234,27 @@ class PropbankInstance(object):
     @property
     def baseform(self):
         """The baseform of the predicate."""
-        return self.roleset.split(".")[0]
+        return self.roleset.split('.')[0]
 
     @property
     def sensenumber(self):
         """The sense number of the predicate."""
-        return self.roleset.split(".")[1]
+        return self.roleset.split('.')[1]
 
     @property
     def predid(self):
         """Identifier of the predicate."""
-        return "rel"
+        return 'rel'
 
     def __repr__(self):
-        return "<PropbankInstance: %s, sent %s, word %s>" % (
+        return '<PropbankInstance: %s, sent %s, word %s>' % (
             self.fileid,
             self.sentnum,
             self.wordnum,
         )
 
     def __str__(self):
-        s = "%s %s %s %s %s %s" % (
+        s = '%s %s %s %s %s %s' % (
             self.fileid,
             self.sentnum,
             self.wordnum,
@@ -259,9 +262,9 @@ class PropbankInstance(object):
             self.roleset,
             self.inflection,
         )
-        items = self.arguments + ((self.predicate, "rel"),)
+        items = self.arguments + ((self.predicate, 'rel'),)
         for (argloc, argid) in sorted(items):
-            s += " %s-%s" % (argloc, argid)
+            s += ' %s-%s' % (argloc, argid)
         return s
 
     def _get_tree(self):
@@ -282,14 +285,14 @@ class PropbankInstance(object):
     def parse(s, parse_fileid_xform=None, parse_corpus=None):
         pieces = s.split()
         if len(pieces) < 7:
-            raise ValueError("Badly formatted propbank line: %r" % s)
+            raise ValueError('Badly formatted propbank line: %r' % s)
 
         # Divide the line into its basic pieces.
         (fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]
-        rel = [p for p in pieces[6:] if p.endswith("-rel")]
-        args = [p for p in pieces[6:] if not p.endswith("-rel")]
+        rel = [p for p in pieces[6:] if p.endswith('-rel')]
+        args = [p for p in pieces[6:] if not p.endswith('-rel')]
         if len(rel) != 1:
-            raise ValueError("Badly formatted propbank line: %r" % s)
+            raise ValueError('Badly formatted propbank line: %r' % s)
 
         # Apply the fileid selector, if any.
         if parse_fileid_xform is not None:
@@ -308,7 +311,7 @@ class PropbankInstance(object):
         # Parse the arguments.
         arguments = []
         for arg in args:
-            argloc, argid = arg.split("-", 1)
+            argloc, argid = arg.split('-', 1)
             arguments.append((PropbankTreePointer.parse(argloc), argid))
 
         # Put it all together.
@@ -345,7 +348,7 @@ class PropbankPointer(object):
             raise NotImplementedError()
 
 
-
+@compat.python_2_unicode_compatible
 class PropbankChainTreePointer(PropbankPointer):
     def __init__(self, pieces):
         self.pieces = pieces
@@ -354,18 +357,18 @@ class PropbankChainTreePointer(PropbankPointer):
            ``PropbankTreePointer`` pointers."""
 
     def __str__(self):
-        return "*".join("%s" % p for p in self.pieces)
+        return '*'.join('%s' % p for p in self.pieces)
 
     def __repr__(self):
-        return "<PropbankChainTreePointer: %s>" % self
+        return '<PropbankChainTreePointer: %s>' % self
 
     def select(self, tree):
         if tree is None:
-            raise ValueError("Parse tree not avaialable")
-        return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])
-
+            raise ValueError('Parse tree not avaialable')
+        return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
 
 
+@compat.python_2_unicode_compatible
 class PropbankSplitTreePointer(PropbankPointer):
     def __init__(self, pieces):
         self.pieces = pieces
@@ -373,19 +376,19 @@ class PropbankSplitTreePointer(PropbankPointer):
            all ``PropbankTreePointer`` pointers."""
 
     def __str__(self):
-        return ",".join("%s" % p for p in self.pieces)
+        return ','.join('%s' % p for p in self.pieces)
 
     def __repr__(self):
-        return "<PropbankSplitTreePointer: %s>" % self
+        return '<PropbankSplitTreePointer: %s>' % self
 
     def select(self, tree):
         if tree is None:
-            raise ValueError("Parse tree not avaialable")
-        return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])
+            raise ValueError('Parse tree not available')
+        return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])
 
 
 @total_ordering
-
+@compat.python_2_unicode_compatible
 class PropbankTreePointer(PropbankPointer):
     """
     wordnum:height*wordnum:height*...
@@ -400,30 +403,30 @@ class PropbankTreePointer(PropbankPointer):
     @staticmethod
     def parse(s):
         # Deal with chains (xx*yy*zz)
-        pieces = s.split("*")
+        pieces = s.split('*')
         if len(pieces) > 1:
             return PropbankChainTreePointer(
                 [PropbankTreePointer.parse(elt) for elt in pieces]
             )
 
         # Deal with split args (xx,yy,zz)
-        pieces = s.split(",")
+        pieces = s.split(',')
         if len(pieces) > 1:
             return PropbankSplitTreePointer(
                 [PropbankTreePointer.parse(elt) for elt in pieces]
             )
 
         # Deal with normal pointers.
-        pieces = s.split(":")
+        pieces = s.split(':')
         if len(pieces) != 2:
-            raise ValueError("bad propbank pointer %r" % s)
+            raise ValueError('bad propbank pointer %r' % s)
         return PropbankTreePointer(int(pieces[0]), int(pieces[1]))
 
     def __str__(self):
-        return "%s:%s" % (self.wordnum, self.height)
+        return '%s:%s' % (self.wordnum, self.height)
 
     def __repr__(self):
-        return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)
+        return 'PropbankTreePointer(%d, %d)' % (self.wordnum, self.height)
 
     def __eq__(self, other):
         while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
@@ -448,7 +451,7 @@ class PropbankTreePointer(PropbankPointer):
 
     def select(self, tree):
         if tree is None:
-            raise ValueError("Parse tree not avaialable")
+            raise ValueError('Parse tree not available')
         return tree[self.treepos(tree)]
 
     def treepos(self, tree):
@@ -457,12 +460,14 @@ class PropbankTreePointer(PropbankPointer):
         given that it points to the given tree.
         """
         if tree is None:
-            raise ValueError("Parse tree not avaialable")
+            raise ValueError('Parse tree not available')
         stack = [tree]
         treepos = []
 
         wordnum = 0
         while True:
+            # print treepos
+            # print stack[-1]
             # tree node:
             if isinstance(stack[-1], Tree):
                 # Select the next child.
@@ -486,31 +491,31 @@ class PropbankTreePointer(PropbankPointer):
                     stack.pop()
 
 
-
+@compat.python_2_unicode_compatible
 class PropbankInflection(object):
     # { Inflection Form
-    INFINITIVE = "i"
-    GERUND = "g"
-    PARTICIPLE = "p"
-    FINITE = "v"
+    INFINITIVE = 'i'
+    GERUND = 'g'
+    PARTICIPLE = 'p'
+    FINITE = 'v'
     # { Inflection Tense
-    FUTURE = "f"
-    PAST = "p"
-    PRESENT = "n"
+    FUTURE = 'f'
+    PAST = 'p'
+    PRESENT = 'n'
     # { Inflection Aspect
-    PERFECT = "p"
-    PROGRESSIVE = "o"
-    PERFECT_AND_PROGRESSIVE = "b"
+    PERFECT = 'p'
+    PROGRESSIVE = 'o'
+    PERFECT_AND_PROGRESSIVE = 'b'
     # { Inflection Person
-    THIRD_PERSON = "3"
+    THIRD_PERSON = '3'
     # { Inflection Voice
-    ACTIVE = "a"
-    PASSIVE = "p"
+    ACTIVE = 'a'
+    PASSIVE = 'p'
     # { Inflection
-    NONE = "-"
+    NONE = '-'
     # }
 
-    def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
+    def __init__(self, form='-', tense='-', aspect='-', person='-', voice='-'):
         self.form = form
         self.tense = tense
         self.aspect = aspect
@@ -521,14 +526,14 @@ class PropbankInflection(object):
         return self.form + self.tense + self.aspect + self.person + self.voice
 
     def __repr__(self):
-        return "<PropbankInflection: %s>" % self
+        return '<PropbankInflection: %s>' % self
 
-    _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")
+    _VALIDATE = re.compile(r'[igpv\-][fpn\-][pob\-][3\-][ap\-]$')
 
     @staticmethod
     def parse(s):
-        if not isinstance(s, str):
-            raise TypeError("expected a string")
+        if not isinstance(s, string_types):
+            raise TypeError('expected a string')
         if len(s) != 5 or not PropbankInflection._VALIDATE.match(s):
-            raise ValueError("Bad propbank inflection string %r" % s)
+            raise ValueError('Bad propbank inflection string %r' % s)
         return PropbankInflection(*s)
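
A minimal usage sketch for the PropBank pointer and inflection parsers touched above (illustrative only; assumes nltk's propbank reader module is importable as in NLTK 3.4):

    >>> from nltk.corpus.reader.propbank import PropbankTreePointer, PropbankInflection
    >>> PropbankTreePointer.parse('7:0')                # plain wordnum:height pointer
    PropbankTreePointer(7, 0)
    >>> PropbankTreePointer.parse('2:1*5:0')            # '*'-joined chain of pointers
    <PropbankChainTreePointer: 2:1*5:0>
    >>> infl = PropbankInflection.parse('vp--a')        # form/tense/aspect/person/voice
    >>> infl.form, infl.tense, infl.voice
    ('v', 'p', 'a')
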
index ca9e540..8117918 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Pros and Cons Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Pierpaolo Pantone <24alsecondo@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -27,6 +27,8 @@ Related papers:
 """
 import re
 
+from six import string_types
+
 from nltk.corpus.reader.api import *
 from nltk.tokenize import *
 
@@ -51,7 +53,7 @@ class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
         root,
         fileids,
         word_tokenizer=WordPunctTokenizer(),
-        encoding="utf8",
+        encoding='utf8',
         **kwargs
     ):
         """
@@ -82,7 +84,7 @@ class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
         fileids = self._resolve(fileids, categories)
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat(
             [
@@ -106,7 +108,7 @@ class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
         fileids = self._resolve(fileids, categories)
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat(
             [
@@ -134,7 +136,7 @@ class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
 
     def _resolve(self, fileids, categories):
         if fileids is not None and categories is not None:
-            raise ValueError("Specify fileids or categories, not both")
+            raise ValueError('Specify fileids or categories, not both')
         if categories is not None:
             return self.fileids(categories)
         else:
index fc0b61c..9a1f173 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Product Reviews Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Pierpaolo Pantone <24alsecondo@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -59,19 +59,24 @@ Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
     consideration.
 """
 
+from __future__ import division
+
 import re
 
+from six import string_types
+
 from nltk.corpus.reader.api import *
 from nltk.tokenize import *
 
-TITLE = re.compile(r"^\[t\](.*)$")  # [t] Title
+TITLE = re.compile(r'^\[t\](.*)$')  # [t] Title
 FEATURES = re.compile(
-    r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]"
+    r'((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]'
 )  # find 'feature' in feature[+3]
-NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]")  # find 'p' in camera[+2][p]
-SENT = re.compile(r"##(.*)$")  # find tokenized sentence
+NOTES = re.compile(r'\[(?!t)(p|u|s|cc|cs)\]')  # find 'p' in camera[+2][p]
+SENT = re.compile(r'##(.*)$')  # find tokenized sentence
 
 
+@compat.python_2_unicode_compatible
 class Review(object):
     """
     A Review is the main block of a ReviewsCorpusReader.
@@ -120,11 +125,12 @@ class Review(object):
         return [review_line.sent for review_line in self.review_lines]
 
     def __repr__(self):
-        return 'Review(title="{}", review_lines={})'.format(
+        return 'Review(title=\"{}\", review_lines={})'.format(
             self.title, self.review_lines
         )
 
 
+@compat.python_2_unicode_compatible
 class ReviewLine(object):
     """
     A ReviewLine represents a sentence of the review, together with (optional)
@@ -144,7 +150,7 @@ class ReviewLine(object):
             self.notes = notes
 
     def __repr__(self):
-        return "ReviewLine(features={}, notes={}, sent={})".format(
+        return 'ReviewLine(features={}, notes={}, sent={})'.format(
             self.features, self.notes, self.sent
         )
 
@@ -174,8 +180,10 @@ class ReviewsCorpusReader(CorpusReader):
 
     We can compute stats for specific product features:
 
+        >>> from __future__ import division
         >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
         >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
+        >>> # The __future__ division import above ensures true division on Python 2.7
         >>> mean = tot / n_reviews
         >>> print(n_reviews, tot, mean)
         15 24 1.6
@@ -184,7 +192,7 @@ class ReviewsCorpusReader(CorpusReader):
     CorpusView = StreamBackedCorpusView
 
     def __init__(
-        self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8"
+        self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding='utf8'
     ):
         """
         :param root: The root directory for the corpus.
@@ -209,7 +217,7 @@ class ReviewsCorpusReader(CorpusReader):
         """
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat(
             [
@@ -227,7 +235,7 @@ class ReviewsCorpusReader(CorpusReader):
         """
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
index 9538f47..0b0cd44 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: RTE Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author:  Ewan Klein <ewan@inf.ed.ac.uk>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -32,6 +32,11 @@ In order to provide globally unique IDs for each pair, a new attribute
 file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
 challenge number and 'n' is the pair ID.
 """
+from __future__ import unicode_literals
+
+from six import string_types
+
+from nltk import compat
 from nltk.corpus.reader.util import *
 from nltk.corpus.reader.api import *
 from nltk.corpus.reader.xmldocs import *
@@ -51,6 +56,7 @@ def norm(value_string):
     return valdict[value_string.upper()]
 
 
+@compat.python_2_unicode_compatible
 class RTEPair(object):
     """
     Container for RTE text-hypothesis pairs.
@@ -103,9 +109,9 @@ class RTEPair(object):
 
     def __repr__(self):
         if self.challenge:
-            return "<RTEPair: gid=%s-%s>" % (self.challenge, self.id)
+            return '<RTEPair: gid=%s-%s>' % (self.challenge, self.id)
         else:
-            return "<RTEPair: id=%s>" % self.id
+            return '<RTEPair: id=%s>' % self.id
 
 
 class RTECorpusReader(XMLCorpusReader):
@@ -127,7 +133,7 @@ class RTECorpusReader(XMLCorpusReader):
         :rtype: list(RTEPair)
         """
         try:
-            challenge = doc.attrib["challenge"]
+            challenge = doc.attrib['challenge']
         except KeyError:
             challenge = None
         return [RTEPair(pair, challenge=challenge) for pair in doc.getiterator("pair")]
@@ -140,6 +146,6 @@ class RTECorpusReader(XMLCorpusReader):
         :type: list
         :rtype: list(RTEPair)
         """
-        if isinstance(fileids, str):
+        if isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])
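
A minimal usage sketch for the RTE reader above (assumes the 'rte' corpus has been downloaded via nltk.download('rte'); the fileid 'rte1_dev.xml' is assumed to be one of its challenge files):

    from nltk.corpus import rte
    pair = rte.pairs('rte1_dev.xml')[0]
    print(pair.text)    # the text passage
    print(pair.hyp)     # the hypothesis
    print(pair.value)   # 1 if the text entails the hypothesis, else 0
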
index f04ea45..1b6f515 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: SemCor Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Nathan Schneider <nschneid@cs.cmu.edu>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -8,8 +8,9 @@
 """
 Corpus reader for the SemCor Corpus.
 """
+from __future__ import absolute_import, unicode_literals
 
-__docformat__ = "epytext en"
+__docformat__ = 'epytext en'
 
 from nltk.corpus.reader.api import *
 from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
@@ -34,7 +35,7 @@ class SemcorCorpusReader(XMLCorpusReader):
         :return: the given file(s) as a list of words and punctuation symbols.
         :rtype: list(str)
         """
-        return self._items(fileids, "word", False, False, False)
+        return self._items(fileids, 'word', False, False, False)
 
     def chunks(self, fileids=None):
         """
@@ -43,9 +44,9 @@ class SemcorCorpusReader(XMLCorpusReader):
             that form a unit.
         :rtype: list(list(str))
         """
-        return self._items(fileids, "chunk", False, False, False)
+        return self._items(fileids, 'chunk', False, False, False)
 
-    def tagged_chunks(self, fileids=None, tag=("pos" or "sem" or "both")):
+    def tagged_chunks(self, fileids=None, tag=('pos' or 'sem' or 'both')):
         """
         :return: the given file(s) as a list of tagged chunks, represented
             in tree form.
@@ -58,7 +59,7 @@ class SemcorCorpusReader(XMLCorpusReader):
             have no lemma.  Other chunks not in WordNet have no semantic tag.
             Punctuation tokens have `None` for their part of speech tag.)
         """
-        return self._items(fileids, "chunk", False, tag != "sem", tag != "pos")
+        return self._items(fileids, 'chunk', False, tag != 'sem', tag != 'pos')
 
     def sents(self, fileids=None):
         """
@@ -66,7 +67,7 @@ class SemcorCorpusReader(XMLCorpusReader):
             as a list of word strings.
         :rtype: list(list(str))
         """
-        return self._items(fileids, "word", True, False, False)
+        return self._items(fileids, 'word', True, False, False)
 
     def chunk_sents(self, fileids=None):
         """
@@ -74,9 +75,9 @@ class SemcorCorpusReader(XMLCorpusReader):
             as a list of chunks.
         :rtype: list(list(list(str)))
         """
-        return self._items(fileids, "chunk", True, False, False)
+        return self._items(fileids, 'chunk', True, False, False)
 
-    def tagged_sents(self, fileids=None, tag=("pos" or "sem" or "both")):
+    def tagged_sents(self, fileids=None, tag=('pos' or 'sem' or 'both')):
         """
         :return: the given file(s) as a list of sentences. Each sentence
             is represented as a list of tagged chunks (in tree form).
@@ -89,10 +90,10 @@ class SemcorCorpusReader(XMLCorpusReader):
             have no lemma.  Other chunks not in WordNet have no semantic tag.
             Punctuation tokens have `None` for their part of speech tag.)
         """
-        return self._items(fileids, "chunk", True, tag != "sem", tag != "pos")
+        return self._items(fileids, 'chunk', True, tag != 'sem', tag != 'pos')
 
     def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
-        if unit == "word" and not bracket_sent:
+        if unit == 'word' and not bracket_sent:
             # the result of the SemcorWordView may be a multiword unit, so the
             # LazyConcatenation will make sure the sentence is flattened
             _ = lambda *args: LazyConcatenation(
@@ -121,23 +122,23 @@ class SemcorCorpusReader(XMLCorpusReader):
         :param sem_tag: Whether to include semantic tags, namely WordNet lemma
             and OOV named entity status.
         """
-        assert unit in ("token", "word", "chunk")
+        assert unit in ('token', 'word', 'chunk')
         result = []
 
         xmldoc = ElementTree.parse(fileid).getroot()
-        for xmlsent in xmldoc.findall(".//s"):
+        for xmlsent in xmldoc.findall('.//s'):
             sent = []
             for xmlword in _all_xmlwords_in(xmlsent):
                 itm = SemcorCorpusReader._word(
                     xmlword, unit, pos_tag, sem_tag, self._wordnet
                 )
-                if unit == "word":
+                if unit == 'word':
                     sent.extend(itm)
                 else:
                     sent.append(itm)
 
             if bracket_sent:
-                result.append(SemcorSentence(xmlsent.attrib["snum"], sent))
+                result.append(SemcorSentence(xmlsent.attrib['snum'], sent))
             else:
                 result.extend(sent)
 
@@ -150,29 +151,29 @@ class SemcorCorpusReader(XMLCorpusReader):
         if not tkn:
             tkn = ""  # fixes issue 337?
 
-        lemma = xmlword.get("lemma", tkn)  # lemma or NE class
-        lexsn = xmlword.get("lexsn")  # lex_sense (locator for the lemma's sense)
+        lemma = xmlword.get('lemma', tkn)  # lemma or NE class
+        lexsn = xmlword.get('lexsn')  # lex_sense (locator for the lemma's sense)
         if lexsn is not None:
-            sense_key = lemma + "%" + lexsn
-            wnpos = ("n", "v", "a", "r", "s")[
-                int(lexsn.split(":")[0]) - 1
+            sense_key = lemma + '%' + lexsn
+            wnpos = ('n', 'v', 'a', 'r', 's')[
+                int(lexsn.split(':')[0]) - 1
             ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
         else:
             sense_key = wnpos = None
         redef = xmlword.get(
-            "rdf", tkn
+            'rdf', tkn
         )  # redefinition--this indicates the lookup string
         # does not exactly match the enclosed string, e.g. due to typographical adjustments
         # or discontinuity of a multiword expression. If a redefinition has occurred,
         # the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
         # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
-        sensenum = xmlword.get("wnsn")  # WordNet sense number
-        isOOVEntity = "pn" in xmlword.keys()  # a "personal name" (NE) not in WordNet
+        sensenum = xmlword.get('wnsn')  # WordNet sense number
+        isOOVEntity = 'pn' in xmlword.keys()  # a "personal name" (NE) not in WordNet
         pos = xmlword.get(
-            "pos"
+            'pos'
         )  # part of speech for the whole chunk (None for punctuation)
 
-        if unit == "token":
+        if unit == 'token':
             if not pos_tag and not sem_tag:
                 itm = tkn
             else:
@@ -183,8 +184,8 @@ class SemcorCorpusReader(XMLCorpusReader):
                 )
             return itm
         else:
-            ww = tkn.split("_")  # TODO: case where punctuation intervenes in MWE
-            if unit == "word":
+            ww = tkn.split('_')  # TODO: case where punctuation intervenes in MWE
+            if unit == 'word':
                 return ww
             else:
                 if sensenum is not None:
@@ -197,23 +198,23 @@ class SemcorCorpusReader(XMLCorpusReader):
                         #  nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
                         # solution: just use the lemma name as a string
                         try:
-                            sense = "%s.%s.%02d" % (
+                            sense = '%s.%s.%02d' % (
                                 lemma,
                                 wnpos,
                                 int(sensenum),
                             )  # e.g.: reach.v.02
                         except ValueError:
                             sense = (
-                                lemma + "." + wnpos + "." + sensenum
+                                lemma + '.' + wnpos + '.' + sensenum
                             )  # e.g. the sense number may be "2;1"
 
                 bottom = [Tree(pos, ww)] if pos_tag else ww
 
                 if sem_tag and isOOVEntity:
                     if sensenum is not None:
-                        return Tree(sense, [Tree("NE", bottom)])
+                        return Tree(sense, [Tree('NE', bottom)])
                     else:  # 'other' NE
-                        return Tree("NE", bottom)
+                        return Tree('NE', bottom)
                 elif sem_tag and sensenum is not None:
                     return Tree(sense, bottom)
                 elif pos_tag:
@@ -226,7 +227,7 @@ def _all_xmlwords_in(elt, result=None):
     if result is None:
         result = []
     for child in elt:
-        if child.tag in ("wf", "punc"):
+        if child.tag in ('wf', 'punc'):
             result.append(child)
         else:
             _all_xmlwords_in(child, result)
@@ -259,9 +260,9 @@ class SemcorWordView(XMLCorpusView):
             and OOV named entity status.
         """
         if bracket_sent:
-            tagspec = ".*/s"
+            tagspec = '.*/s'
         else:
-            tagspec = ".*/s/(punc|wf)"
+            tagspec = '.*/s/(punc|wf)'
 
         self._unit = unit
         self._sent = bracket_sent
@@ -285,12 +286,12 @@ class SemcorWordView(XMLCorpusView):
     def handle_sent(self, elt):
         sent = []
         for child in elt:
-            if child.tag in ("wf", "punc"):
+            if child.tag in ('wf', 'punc'):
                 itm = self.handle_word(child)
-                if self._unit == "word":
+                if self._unit == 'word':
                     sent.extend(itm)
                 else:
                     sent.append(itm)
             else:
-                raise ValueError("Unexpected element %s" % child.tag)
-        return SemcorSentence(elt.attrib["snum"], sent)
+                raise ValueError('Unexpected element %s' % child.tag)
+        return SemcorSentence(elt.attrib['snum'], sent)
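
A minimal usage sketch for the SemCor reader above, showing how the tag argument ('pos', 'sem' or 'both') selects which annotations are returned (assumes nltk.download('semcor') and nltk.download('wordnet') have been run):

    from nltk.corpus import semcor
    chunk = semcor.tagged_chunks(tag='both')[0]   # one chunk as a Tree (POS and, where available, WordNet sense)
    sent = semcor.tagged_sents(tag='sem')[0]      # one sentence as a list of semantically tagged chunks
    print(chunk)
    print(len(sent))
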
index 5d1a250..66a5386 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Senseval 2 Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
 #         Steven Bird <stevenbird1@gmail.com> (modifications)
 # URL: <http://nltk.org/>
@@ -21,16 +21,21 @@ The NLTK version of the Senseval 2 files uses well-formed XML.
 Each instance of the ambiguous words "hard", "interest", "line", and "serve"
 is tagged with a sense identifier, and supplied with context.
 """
+from __future__ import print_function, unicode_literals
 
 import re
 from xml.etree import ElementTree
 
+from six import string_types
+
+from nltk import compat
 from nltk.tokenize import *
 
 from nltk.corpus.reader.util import *
 from nltk.corpus.reader.api import *
 
 
+@compat.python_2_unicode_compatible
 class SensevalInstance(object):
     def __init__(self, word, position, context, senses):
         self.word = word
@@ -39,7 +44,7 @@ class SensevalInstance(object):
         self.context = context
 
     def __repr__(self):
-        return "SensevalInstance(word=%r, position=%r, " "context=%r, senses=%r)" % (
+        return 'SensevalInstance(word=%r, position=%r, ' 'context=%r, senses=%r)' % (
             self.word,
             self.position,
             self.context,
@@ -62,16 +67,16 @@ class SensevalCorpusReader(CorpusReader):
         """
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
     def _entry(self, tree):
         elts = []
-        for lexelt in tree.findall("lexelt"):
-            for inst in lexelt.findall("instance"):
-                sense = inst[0].attrib["senseid"]
-                context = [(w.text, w.attrib["pos"]) for w in inst[1]]
+        for lexelt in tree.findall('lexelt'):
+            for inst in lexelt.findall('instance'):
+                sense = inst[0].attrib['senseid']
+                context = [(w.text, w.attrib['pos']) for w in inst[1]]
                 elts.append((sense, context))
         return elts
 
@@ -93,14 +98,14 @@ class SensevalCorpusView(StreamBackedCorpusView):
         in_instance = False
         while True:
             line = stream.readline()
-            if line == "":
+            if line == '':
                 assert instance_lines == []
                 return []
 
             # Start of a lexical element?
-            if line.lstrip().startswith("<lexelt"):
+            if line.lstrip().startswith('<lexelt'):
                 lexelt_num += 1
-                m = re.search("item=(\"[^\"]+\"|'[^']+')", line)
+                m = re.search('item=("[^"]+"|\'[^\']+\')', line)
                 assert m is not None  # <lexelt> has no 'item=...'
                 lexelt = m.group(1)[1:-1]
                 if lexelt_num < len(self._lexelts):
@@ -110,7 +115,7 @@ class SensevalCorpusView(StreamBackedCorpusView):
                     self._lexelt_starts.append(stream.tell())
 
             # Start of an instance?
-            if line.lstrip().startswith("<instance"):
+            if line.lstrip().startswith('<instance'):
                 assert instance_lines == []
                 in_instance = True
 
@@ -119,8 +124,8 @@ class SensevalCorpusView(StreamBackedCorpusView):
                 instance_lines.append(line)
 
             # End of an instance?
-            if line.lstrip().startswith("</instance"):
-                xml_block = "\n".join(instance_lines)
+            if line.lstrip().startswith('</instance'):
+                xml_block = '\n'.join(instance_lines)
                 xml_block = _fixXML(xml_block)
                 inst = ElementTree.fromstring(xml_block)
                 return [self._parse_instance(inst, lexelt)]
@@ -130,17 +135,17 @@ class SensevalCorpusView(StreamBackedCorpusView):
         context = []
         position = None
         for child in instance:
-            if child.tag == "answer":
-                senses.append(child.attrib["senseid"])
-            elif child.tag == "context":
+            if child.tag == 'answer':
+                senses.append(child.attrib['senseid'])
+            elif child.tag == 'context':
                 context += self._word_tokenizer.tokenize(child.text)
                 for cword in child:
-                    if cword.tag == "compound":
+                    if cword.tag == 'compound':
                         cword = cword[0]  # is this ok to do?
 
-                    if cword.tag == "head":
+                    if cword.tag == 'head':
                         # Some santiy checks:
-                        assert position is None, "head specified twice"
+                        assert position is None, 'head specified twice'
                         assert cword.text.strip() or len(cword) == 1
                         assert not (cword.text.strip() and len(cword) == 1)
                         # Record the position of the head:
@@ -148,24 +153,24 @@ class SensevalCorpusView(StreamBackedCorpusView):
                         # Addd on the head word itself:
                         if cword.text.strip():
                             context.append(cword.text.strip())
-                        elif cword[0].tag == "wf":
-                            context.append((cword[0].text, cword[0].attrib["pos"]))
+                        elif cword[0].tag == 'wf':
+                            context.append((cword[0].text, cword[0].attrib['pos']))
                             if cword[0].tail:
                                 context += self._word_tokenizer.tokenize(cword[0].tail)
                         else:
-                            assert False, "expected CDATA or wf in <head>"
-                    elif cword.tag == "wf":
-                        context.append((cword.text, cword.attrib["pos"]))
-                    elif cword.tag == "s":
+                            assert False, 'expected CDATA or wf in <head>'
+                    elif cword.tag == 'wf':
+                        context.append((cword.text, cword.attrib['pos']))
+                    elif cword.tag == 's':
                         pass  # Sentence boundary marker.
 
                     else:
-                        print("ACK", cword.tag)
-                        assert False, "expected CDATA or <wf> or <head>"
+                        print('ACK', cword.tag)
+                        assert False, 'expected CDATA or <wf> or <head>'
                     if cword.tail:
                         context += self._word_tokenizer.tokenize(cword.tail)
             else:
-                assert False, "unexpected tag %s" % child.tag
+                assert False, 'unexpected tag %s' % child.tag
         return SensevalInstance(lexelt, position, context, senses)
 
 
@@ -174,31 +179,31 @@ def _fixXML(text):
     Fix the various issues with Senseval pseudo-XML.
     """
     # <~> or <^> => ~ or ^
-    text = re.sub(r"<([~\^])>", r"\1", text)
+    text = re.sub(r'<([~\^])>', r'\1', text)
     # fix lone &
-    text = re.sub(r"(\s+)\&(\s+)", r"\1&amp;\2", text)
+    text = re.sub(r'(\s+)\&(\s+)', r'\1&amp;\2', text)
     # fix """
-    text = re.sub(r'"""', "'\"'", text)
+    text = re.sub(r'"""', '\'"\'', text)
     # fix <s snum=dd> => <s snum="dd"/>
     text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
     # fix foreign word tag
-    text = re.sub(r"<\&frasl>\s*<p[^>]*>", "FRASL", text)
+    text = re.sub(r'<\&frasl>\s*<p[^>]*>', 'FRASL', text)
     # remove <&I .>
-    text = re.sub(r"<\&I[^>]*>", "", text)
+    text = re.sub(r'<\&I[^>]*>', '', text)
     # fix <{word}>
-    text = re.sub(r"<{([^}]+)}>", r"\1", text)
+    text = re.sub(r'<{([^}]+)}>', r'\1', text)
     # remove <@>, <p>, </p>
-    text = re.sub(r"<(@|/?p)>", r"", text)
+    text = re.sub(r'<(@|/?p)>', r'', text)
     # remove <&M .> and <&T .> and <&Ms .>
-    text = re.sub(r"<&\w+ \.>", r"", text)
+    text = re.sub(r'<&\w+ \.>', r'', text)
     # remove <!DOCTYPE... > lines
-    text = re.sub(r"<!DOCTYPE[^>]*>", r"", text)
+    text = re.sub(r'<!DOCTYPE[^>]*>', r'', text)
     # remove <[hi]> and <[/p]> etc
-    text = re.sub(r"<\[\/?[^>]+\]*>", r"", text)
+    text = re.sub(r'<\[\/?[^>]+\]*>', r'', text)
     # take the thing out of the brackets: <&hellip;>
-    text = re.sub(r"<(\&\w+;)>", r"\1", text)
+    text = re.sub(r'<(\&\w+;)>', r'\1', text)
     # and remove the & for those patterns that aren't regular XML
-    text = re.sub(r"&(?!amp|gt|lt|apos|quot)", r"", text)
+    text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text)
     # fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
     text = re.sub(
         r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>', text
index f0097c2..bbe4fc9 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: SentiWordNet
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Christopher Potts <cgpotts@stanford.edu>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -37,19 +37,20 @@ http://sentiwordnet.isti.cnr.it/
 """
 
 import re
-
+from nltk.compat import python_2_unicode_compatible
 from nltk.corpus.reader import CorpusReader
 
 
+@python_2_unicode_compatible
 class SentiWordNetCorpusReader(CorpusReader):
-    def __init__(self, root, fileids, encoding="utf-8"):
+    def __init__(self, root, fileids, encoding='utf-8'):
         """
         Construct a new SentiWordNet Corpus Reader, using data from
        the specified file.
         """
         super(SentiWordNetCorpusReader, self).__init__(root, fileids, encoding=encoding)
         if len(self._fileids) != 1:
-            raise ValueError("Exactly one file must be specified")
+            raise ValueError('Exactly one file must be specified')
         self._db = {}
         self._parse_src_file()
 
@@ -61,7 +62,7 @@ class SentiWordNetCorpusReader(CorpusReader):
             try:
                 pos, offset, pos_score, neg_score, synset_terms, gloss = fields
             except:
-                raise ValueError("Line %s formatted incorrectly: %s\n" % (i, line))
+                raise ValueError('Line %s formatted incorrectly: %s\n' % (i, line))
             if pos and offset:
                 offset = int(offset)
                 self._db[(pos, offset)] = (float(pos_score), float(neg_score))
@@ -72,15 +73,15 @@ class SentiWordNetCorpusReader(CorpusReader):
         if tuple(vals) in self._db:
             pos_score, neg_score = self._db[tuple(vals)]
             pos, offset = vals
-            if pos == "s":
-                pos = "a"
+            if pos == 's':
+                pos = 'a'
             synset = wn.synset_from_pos_and_offset(pos, offset)
             return SentiSynset(pos_score, neg_score, synset)
         else:
             synset = wn.synset(vals[0])
             pos = synset.pos()
-            if pos == "s":
-                pos = "a"
+            if pos == 's':
+                pos = 'a'
             offset = synset.offset()
             if (pos, offset) in self._db:
                 pos_score, neg_score = self._db[(pos, offset)]
@@ -108,6 +109,7 @@ class SentiWordNetCorpusReader(CorpusReader):
             yield SentiSynset(pos_score, neg_score, synset)
 
 
+@python_2_unicode_compatible
 class SentiSynset(object):
     def __init__(self, pos_score, neg_score, synset):
         self._pos_score = pos_score
index 15b997c..fbbc92d 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Sinica Treebank Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -44,10 +44,10 @@ from nltk.tag import map_tag
 from nltk.corpus.reader.util import *
 from nltk.corpus.reader.api import *
 
-IDENTIFIER = re.compile(r"^#\S+\s")
-APPENDIX = re.compile(r"(?<=\))#.*$")
-TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)")
-WORD = re.compile(r":[^:()|]+:([^:()|]+)")
+IDENTIFIER = re.compile(r'^#\S+\s')
+APPENDIX = re.compile(r'(?<=\))#.*$')
+TAGWORD = re.compile(r':([^:()|]+):([^:()|]+)')
+WORD = re.compile(r':[^:()|]+:([^:()|]+)')
 
 
 class SinicaTreebankCorpusReader(SyntaxCorpusReader):
@@ -57,8 +57,8 @@ class SinicaTreebankCorpusReader(SyntaxCorpusReader):
 
     def _read_block(self, stream):
         sent = stream.readline()
-        sent = IDENTIFIER.sub("", sent)
-        sent = APPENDIX.sub("", sent)
+        sent = IDENTIFIER.sub('', sent)
+        sent = APPENDIX.sub('', sent)
         return [sent]
 
     def _parse(self, sent):
index 136a62e..eaf5bf4 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: String Category Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -18,6 +18,9 @@ NUM:date When did Hawaii become a state ?
 """
 
 # based on PPAttachmentCorpusReader
+from six import string_types
+
+from nltk import compat
 from nltk.corpus.reader.util import *
 from nltk.corpus.reader.api import *
 
@@ -25,7 +28,7 @@ from nltk.corpus.reader.api import *
 # in nltk, we use the form (data, tag) -- e.g., tagged words and
 # labeled texts for classifiers.
 class StringCategoryCorpusReader(CorpusReader):
-    def __init__(self, root, fileids, delimiter=" ", encoding="utf8"):
+    def __init__(self, root, fileids, delimiter=' ', encoding='utf8'):
         """
         :param root: The root directory for this corpus.
         :param fileids: A list or regexp specifying the fileids in this corpus.
@@ -37,7 +40,7 @@ class StringCategoryCorpusReader(CorpusReader):
     def tuples(self, fileids=None):
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat(
             [
@@ -52,7 +55,7 @@ class StringCategoryCorpusReader(CorpusReader):
         """
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
index 593ef45..ed65c42 100644 (file)
@@ -1,17 +1,20 @@
 # Natural Language Toolkit: Switchboard Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import unicode_literals
 import re
 
 from nltk.tag import str2tuple, map_tag
+from nltk import compat
 
 from nltk.corpus.reader.util import *
 from nltk.corpus.reader.api import *
 
 
+@compat.python_2_unicode_compatible
 class SwitchboardTurn(list):
     """
     A specialized list object used to encode switchboard utterances.
@@ -28,16 +31,16 @@ class SwitchboardTurn(list):
 
     def __repr__(self):
         if len(self) == 0:
-            text = ""
+            text = ''
         elif isinstance(self[0], tuple):
-            text = " ".join("%s/%s" % w for w in self)
+            text = ' '.join('%s/%s' % w for w in self)
         else:
-            text = " ".join(self)
-        return "<%s.%s: %r>" % (self.speaker, self.id, text)
+            text = ' '.join(self)
+        return '<%s.%s: %r>' % (self.speaker, self.id, text)
 
 
 class SwitchboardCorpusReader(CorpusReader):
-    _FILES = ["tagged"]
+    _FILES = ['tagged']
     # Use the "tagged" file even for non-tagged data methods, since
     # it's tokenized.
 
@@ -46,26 +49,26 @@ class SwitchboardCorpusReader(CorpusReader):
         self._tagset = tagset
 
     def words(self):
-        return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader)
+        return StreamBackedCorpusView(self.abspath('tagged'), self._words_block_reader)
 
     def tagged_words(self, tagset=None):
         def tagged_words_block_reader(stream):
             return self._tagged_words_block_reader(stream, tagset)
 
-        return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader)
+        return StreamBackedCorpusView(self.abspath('tagged'), tagged_words_block_reader)
 
     def turns(self):
-        return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader)
+        return StreamBackedCorpusView(self.abspath('tagged'), self._turns_block_reader)
 
     def tagged_turns(self, tagset=None):
         def tagged_turns_block_reader(stream):
             return self._tagged_turns_block_reader(stream, tagset)
 
-        return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader)
+        return StreamBackedCorpusView(self.abspath('tagged'), tagged_turns_block_reader)
 
     def discourses(self):
         return StreamBackedCorpusView(
-            self.abspath("tagged"), self._discourses_block_reader
+            self.abspath('tagged'), self._discourses_block_reader
         )
 
     def tagged_discourses(self, tagset=False):
@@ -73,7 +76,7 @@ class SwitchboardCorpusReader(CorpusReader):
             return self._tagged_discourses_block_reader(stream, tagset)
 
         return StreamBackedCorpusView(
-            self.abspath("tagged"), tagged_discourses_block_reader
+            self.abspath('tagged'), tagged_discourses_block_reader
         )
 
     def _discourses_block_reader(self, stream):
@@ -82,7 +85,7 @@ class SwitchboardCorpusReader(CorpusReader):
             [
                 self._parse_utterance(u, include_tag=False)
                 for b in read_blankline_block(stream)
-                for u in b.split("\n")
+                for u in b.split('\n')
                 if u.strip()
             ]
         ]
@@ -93,7 +96,7 @@ class SwitchboardCorpusReader(CorpusReader):
             [
                 self._parse_utterance(u, include_tag=True, tagset=tagset)
                 for b in read_blankline_block(stream)
-                for u in b.split("\n")
+                for u in b.split('\n')
                 if u.strip()
             ]
         ]
@@ -110,13 +113,13 @@ class SwitchboardCorpusReader(CorpusReader):
     def _tagged_words_block_reader(self, stream, tagset=None):
         return sum(self._tagged_discourses_block_reader(stream, tagset)[0], [])
 
-    _UTTERANCE_RE = re.compile("(\w+)\.(\d+)\:\s*(.*)")
-    _SEP = "/"
+    _UTTERANCE_RE = re.compile('(\w+)\.(\d+)\:\s*(.*)')
+    _SEP = '/'
 
     def _parse_utterance(self, utterance, include_tag, tagset=None):
         m = self._UTTERANCE_RE.match(utterance)
         if m is None:
-            raise ValueError("Bad utterance %r" % utterance)
+            raise ValueError('Bad utterance %r' % utterance)
         speaker, id, text = m.groups()
         words = [str2tuple(s, self._SEP) for s in text.split()]
         if not include_tag:
index afd27b1..3af1653 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Tagged Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 #         Jacob Perkins <japerk@gmail.com>
@@ -13,6 +13,8 @@ A reader for corpora whose documents contain part-of-speech-tagged words.
 
 import os
 
+from six import string_types
+
 from nltk.tag import str2tuple, map_tag
 from nltk.tokenize import *
 
@@ -41,11 +43,11 @@ class TaggedCorpusReader(CorpusReader):
         self,
         root,
         fileids,
-        sep="/",
+        sep='/',
         word_tokenizer=WhitespaceTokenizer(),
-        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
+        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
         para_block_reader=read_blankline_block,
-        encoding="utf8",
+        encoding='utf8',
         tagset=None,
     ):
         """
@@ -72,7 +74,7 @@ class TaggedCorpusReader(CorpusReader):
         """
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
@@ -256,7 +258,7 @@ class CategorizedTaggedCorpusReader(CategorizedCorpusReader, TaggedCorpusReader)
 
     def _resolve(self, fileids, categories):
         if fileids is not None and categories is not None:
-            raise ValueError("Specify fileids or categories, not both")
+            raise ValueError('Specify fileids or categories, not both')
         if categories is not None:
             return self.fileids(categories)
         else:
@@ -358,21 +360,21 @@ class MacMorphoCorpusReader(TaggedCorpusReader):
     sentence.
     """
 
-    def __init__(self, root, fileids, encoding="utf8", tagset=None):
+    def __init__(self, root, fileids, encoding='utf8', tagset=None):
         TaggedCorpusReader.__init__(
             self,
             root,
             fileids,
-            sep="_",
+            sep='_',
             word_tokenizer=LineTokenizer(),
-            sent_tokenizer=RegexpTokenizer(".*\n"),
+            sent_tokenizer=RegexpTokenizer('.*\n'),
             para_block_reader=self._read_block,
             encoding=encoding,
             tagset=tagset,
         )
 
     def _read_block(self, stream):
-        return read_regexp_block(stream, r".*", r".*_\.")
+        return read_regexp_block(stream, r'.*', r'.*_\.')
 
 
 class TimitTaggedCorpusReader(TaggedCorpusReader):
@@ -386,7 +388,7 @@ class TimitTaggedCorpusReader(TaggedCorpusReader):
         )
 
     def paras(self):
-        raise NotImplementedError("use sents() instead")
+        raise NotImplementedError('use sents() instead')
 
     def tagged_paras(self):
-        raise NotImplementedError("use tagged_sents() instead")
+        raise NotImplementedError('use tagged_sents() instead')
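
A minimal usage sketch for TaggedCorpusReader above (the directory ./tagged_corpus and the file pattern are made up for illustration; they should point at plain-text files of word/TAG tokens):

    from nltk.corpus.reader import TaggedCorpusReader
    reader = TaggedCorpusReader('./tagged_corpus', r'.*\.txt', sep='/')
    print(reader.words()[:5])           # plain tokens
    print(reader.tagged_sents()[0])     # [(word, tag), ...] for the first sentence
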
index 7d63248..bbd57c6 100644 (file)
@@ -118,12 +118,17 @@ The 4 functions are as follows.
    timit.audiodata function.
 
 """
+from __future__ import print_function, unicode_literals
+
 import sys
 import os
 import re
 import tempfile
 import time
 
+from six import string_types
+
+from nltk import compat
 from nltk.tree import Tree
 from nltk.internals import import_from_stdlib
 
@@ -149,18 +154,18 @@ class TimitCorpusReader(CorpusReader):
       - <utterance-id>.wav: utterance sound file
     """
 
-    _FILE_RE = r"(\w+-\w+/\w+\.(phn|txt|wav|wrd))|" + r"timitdic\.txt|spkrinfo\.txt"
+    _FILE_RE = r'(\w+-\w+/\w+\.(phn|txt|wav|wrd))|' + r'timitdic\.txt|spkrinfo\.txt'
     """A regexp matching fileids that are used by this corpus reader."""
-    _UTTERANCE_RE = r"\w+-\w+/\w+\.txt"
+    _UTTERANCE_RE = r'\w+-\w+/\w+\.txt'
 
-    def __init__(self, root, encoding="utf8"):
+    def __init__(self, root, encoding='utf8'):
         """
         Construct a new TIMIT corpus reader in the given directory.
         :param root: The root directory for this corpus.
         """
         # Ensure that wave files don't get treated as unicode data:
-        if isinstance(encoding, str):
-            encoding = [(".*\.wav", None), (".*", encoding)]
+        if isinstance(encoding, string_types):
+            encoding = [('.*\.wav', None), ('.*', encoding)]
 
         CorpusReader.__init__(
             self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding
@@ -174,7 +179,7 @@ class TimitCorpusReader(CorpusReader):
 
         self._speakerinfo = None
         self._root = root
-        self.speakers = sorted(set(u.split("/")[0] for u in self._utterances))
+        self.speakers = sorted(set(u.split('/')[0] for u in self._utterances))
 
     def fileids(self, filetype=None):
         """
@@ -188,12 +193,12 @@ class TimitCorpusReader(CorpusReader):
         """
         if filetype is None:
             return CorpusReader.fileids(self)
-        elif filetype in ("txt", "wrd", "phn", "wav"):
-            return ["%s.%s" % (u, filetype) for u in self._utterances]
-        elif filetype == "metadata":
-            return ["timitdic.txt", "spkrinfo.txt"]
+        elif filetype in ('txt', 'wrd', 'phn', 'wav'):
+            return ['%s.%s' % (u, filetype) for u in self._utterances]
+        elif filetype == 'metadata':
+            return ['timitdic.txt', 'spkrinfo.txt']
         else:
-            raise ValueError("Bad value for filetype: %r" % filetype)
+            raise ValueError('Bad value for filetype: %r' % filetype)
 
     def utteranceids(
         self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None
@@ -204,15 +209,15 @@ class TimitCorpusReader(CorpusReader):
         region, gender, sentence type, or sentence number, if
         specified.
         """
-        if isinstance(dialect, str):
+        if isinstance(dialect, string_types):
             dialect = [dialect]
-        if isinstance(sex, str):
+        if isinstance(sex, string_types):
             sex = [sex]
-        if isinstance(spkrid, str):
+        if isinstance(spkrid, string_types):
             spkrid = [spkrid]
-        if isinstance(sent_type, str):
+        if isinstance(sent_type, string_types):
             sent_type = [sent_type]
-        if isinstance(sentid, str):
+        if isinstance(sentid, string_types):
             sentid = [sentid]
 
         utterances = self._utterances[:]
@@ -234,23 +239,23 @@ class TimitCorpusReader(CorpusReader):
         each word.
         """
         _transcriptions = {}
-        for line in self.open("timitdic.txt"):
-            if not line.strip() or line[0] == ";":
+        for line in self.open('timitdic.txt'):
+            if not line.strip() or line[0] == ';':
                 continue
-            m = re.match(r"\s*(\S+)\s+/(.*)/\s*$", line)
+            m = re.match(r'\s*(\S+)\s+/(.*)/\s*$', line)
             if not m:
-                raise ValueError("Bad line: %r" % line)
+                raise ValueError('Bad line: %r' % line)
             _transcriptions[m.group(1)] = m.group(2).split()
         return _transcriptions
 
     def spkrid(self, utterance):
-        return utterance.split("/")[0]
+        return utterance.split('/')[0]
 
     def sentid(self, utterance):
-        return utterance.split("/")[1]
+        return utterance.split('/')[1]
 
     def utterance(self, spkrid, sentid):
-        return "%s/%s" % (spkrid, sentid)
+        return '%s/%s' % (spkrid, sentid)
 
     def spkrutteranceids(self, speaker):
         """
@@ -260,7 +265,7 @@ class TimitCorpusReader(CorpusReader):
         return [
             utterance
             for utterance in self._utterances
-            if utterance.startswith(speaker + "/")
+            if utterance.startswith(speaker + '/')
         ]
 
     def spkrinfo(self, speaker):
@@ -272,8 +277,8 @@ class TimitCorpusReader(CorpusReader):
 
         if self._speakerinfo is None:
             self._speakerinfo = {}
-            for line in self.open("spkrinfo.txt"):
-                if not line.strip() or line[0] == ";":
+            for line in self.open('spkrinfo.txt'):
+                if not line.strip() or line[0] == ';':
                     continue
                 rec = line.strip().split(None, 9)
                 key = "dr%s-%s%s" % (rec[2], rec[1].lower(), rec[0].lower())
@@ -284,7 +289,7 @@ class TimitCorpusReader(CorpusReader):
     def phones(self, utterances=None):
         return [
             line.split()[-1]
-            for fileid in self._utterance_fileids(utterances, ".phn")
+            for fileid in self._utterance_fileids(utterances, '.phn')
             for line in self.open(fileid)
             if line.strip()
         ]
@@ -295,7 +300,7 @@ class TimitCorpusReader(CorpusReader):
         """
         return [
             (line.split()[2], int(line.split()[0]), int(line.split()[1]))
-            for fileid in self._utterance_fileids(utterances, ".phn")
+            for fileid in self._utterance_fileids(utterances, '.phn')
             for line in self.open(fileid)
             if line.strip()
         ]
@@ -303,7 +308,7 @@ class TimitCorpusReader(CorpusReader):
     def words(self, utterances=None):
         return [
             line.split()[-1]
-            for fileid in self._utterance_fileids(utterances, ".wrd")
+            for fileid in self._utterance_fileids(utterances, '.wrd')
             for line in self.open(fileid)
             if line.strip()
         ]
@@ -311,7 +316,7 @@ class TimitCorpusReader(CorpusReader):
     def word_times(self, utterances=None):
         return [
             (line.split()[2], int(line.split()[0]), int(line.split()[1]))
-            for fileid in self._utterance_fileids(utterances, ".wrd")
+            for fileid in self._utterance_fileids(utterances, '.wrd')
             for line in self.open(fileid)
             if line.strip()
         ]
@@ -319,7 +324,7 @@ class TimitCorpusReader(CorpusReader):
     def sents(self, utterances=None):
         return [
             [line.split()[-1] for line in self.open(fileid) if line.strip()]
-            for fileid in self._utterance_fileids(utterances, ".wrd")
+            for fileid in self._utterance_fileids(utterances, '.wrd')
         ]
 
     def sent_times(self, utterances=None):
@@ -329,7 +334,7 @@ class TimitCorpusReader(CorpusReader):
                 int(line.split()[0]),
                 int(line.split()[1]),
             )
-            for fileid in self._utterance_fileids(utterances, ".txt")
+            for fileid in self._utterance_fileids(utterances, '.txt')
             for line in self.open(fileid)
             if line.strip()
         ]
@@ -337,7 +342,7 @@ class TimitCorpusReader(CorpusReader):
     def phone_trees(self, utterances=None):
         if utterances is None:
             utterances = self._utterances
-        if isinstance(utterances, str):
+        if isinstance(utterances, string_types):
             utterances = [utterances]
 
         trees = []
@@ -348,7 +353,7 @@ class TimitCorpusReader(CorpusReader):
 
             while sent_times:
                 (sent, sent_start, sent_end) = sent_times.pop(0)
-                trees.append(Tree("S", []))
+                trees.append(Tree('S', []))
                 while (
                     word_times and phone_times and phone_times[0][2] <= word_times[0][1]
                 ):
@@ -367,9 +372,9 @@ class TimitCorpusReader(CorpusReader):
     # fileids.
     def wav(self, utterance, start=0, end=None):
         # nltk.chunk conflicts with the stdlib module 'chunk'
-        wave = import_from_stdlib("wave")
+        wave = import_from_stdlib('wave')
 
-        w = wave.open(self.open(utterance + ".wav"), "rb")
+        w = wave.open(self.open(utterance + '.wav'), 'rb')
 
         if end is None:
             end = w.getnframes()
@@ -381,7 +386,7 @@ class TimitCorpusReader(CorpusReader):
         # Open a new temporary file -- the wave module requires
         # an actual file, and won't work w/ stringio. :(
         tf = tempfile.TemporaryFile()
-        out = wave.open(tf, "w")
+        out = wave.open(tf, 'w')
 
         # Write the parameters & data to the new file.
         out.setparams(w.getparams())
@@ -397,17 +402,17 @@ class TimitCorpusReader(CorpusReader):
         assert end is None or end > start
         headersize = 44
         if end is None:
-            data = self.open(utterance + ".wav").read()
+            data = self.open(utterance + '.wav').read()
         else:
-            data = self.open(utterance + ".wav").read(headersize + end * 2)
+            data = self.open(utterance + '.wav').read(headersize + end * 2)
         return data[headersize + start * 2 :]
 
     def _utterance_fileids(self, utterances, extension):
         if utterances is None:
             utterances = self._utterances
-        if isinstance(utterances, str):
+        if isinstance(utterances, string_types):
             utterances = [utterances]
-        return ["%s%s" % (u, extension) for u in utterances]
+        return ['%s%s' % (u, extension) for u in utterances]
 
     def play(self, utterance, start=0, end=None):
         """
@@ -420,7 +425,7 @@ class TimitCorpusReader(CorpusReader):
             import ossaudiodev
 
             try:
-                dsp = ossaudiodev.open("w")
+                dsp = ossaudiodev.open('w')
                 dsp.setfmt(ossaudiodev.AFMT_S16_LE)
                 dsp.channels(1)
                 dsp.speed(16000)
@@ -460,6 +465,7 @@ class TimitCorpusReader(CorpusReader):
         )
 
 
+@compat.python_2_unicode_compatible
 class SpeakerInfo(object):
     def __init__(
         self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None
@@ -476,9 +482,9 @@ class SpeakerInfo(object):
         self.comments = comments
 
     def __repr__(self):
-        attribs = "id sex dr use recdate birthdate ht race edu comments"
-        args = ["%s=%r" % (attr, getattr(self, attr)) for attr in attribs.split()]
-        return "SpeakerInfo(%s)" % (", ".join(args))
+        attribs = 'id sex dr use recdate birthdate ht race edu comments'
+        args = ['%s=%r' % (attr, getattr(self, attr)) for attr in attribs.split()]
+        return 'SpeakerInfo(%s)' % (', '.join(args))
 
 
 def read_timit_block(stream):
@@ -489,5 +495,5 @@ def read_timit_block(stream):
     line = stream.readline()
     if not line:
         return []
-    n, sent = line.split(" ", 1)
+    n, sent = line.split(' ', 1)
     return [sent]
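
A minimal usage sketch for the TIMIT reader above (assumes the TIMIT sample corpus has been installed with nltk.download('timit'); the utterance id shown in the comment is only an example):

    from nltk.corpus import timit
    utt = timit.utteranceids()[0]            # e.g. 'dr1-fvmh0/sa1'
    print(timit.words(utt))                  # word tokens of that utterance
    print(timit.phone_times(utt)[:3])        # (phone, start_frame, end_frame) triples
    print(timit.spkrinfo(timit.spkrid(utt))) # SpeakerInfo record for the speaker
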
index aead10b..32acc01 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Toolbox Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Greg Aumann <greg_aumann@sil.org>
 #         Stuart Robinson <Stuart.Robinson@mpi.nl>
 #         Steven Bird <stevenbird1@gmail.com>
@@ -31,8 +31,8 @@ class ToolboxCorpusReader(CorpusReader):
         fileids,
         strip=True,
         unwrap=True,
-        encoding="utf8",
-        errors="strict",
+        encoding='utf8',
+        errors='strict',
         unicode_fields=None,
     ):
         return concat(
@@ -48,11 +48,11 @@ class ToolboxCorpusReader(CorpusReader):
 
     # should probably be done lazily:
     def entries(self, fileids, **kwargs):
-        if "key" in kwargs:
-            key = kwargs["key"]
-            del kwargs["key"]
+        if 'key' in kwargs:
+            key = kwargs['key']
+            del kwargs['key']
         else:
-            key = "lx"  # the default key in MDF
+            key = 'lx'  # the default key in MDF
         entries = []
         for marker, contents in self.fields(fileids, **kwargs):
             if marker == key:
@@ -64,13 +64,13 @@ class ToolboxCorpusReader(CorpusReader):
                     pass
         return entries
 
-    def words(self, fileids, key="lx"):
+    def words(self, fileids, key='lx'):
         return [contents for marker, contents in self.fields(fileids) if marker == key]
 
     def raw(self, fileids):
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
@@ -79,5 +79,5 @@ def demo():
     pass
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
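Illustrative only: the six.string_types check that raw() and the other reader methods revert to, so a caller may pass either a single fileid or a list of fileids under both Python 2 and Python 3. The helper name and the fileids are placeholders.

    from six import string_types


    def as_fileid_list(fileids, default=()):
        # Normalise the fileids argument the way the reverted raw() methods do.
        if fileids is None:
            return list(default)
        if isinstance(fileids, string_types):  # a single fileid passed as a plain string
            return [fileids]
        return list(fileids)


    print(as_fileid_list('rotokas.dic'))
    print(as_fileid_list(['rotokas.dic', 'iu_mien_samp.db']))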
index 7f9b7b7..78b9de3 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Twitter Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -13,6 +13,8 @@ have been serialised into line-delimited JSON.
 import json
 import os
 
+from six import string_types
+
 from nltk.tokenize import TweetTokenizer
 
 from nltk.corpus.reader.util import StreamBackedCorpusView, concat, ZipFilePathPointer
@@ -57,7 +59,7 @@ class TwitterCorpusReader(CorpusReader):
     """
 
     def __init__(
-        self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8"
+        self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding='utf8'
     ):
         """
 
@@ -108,7 +110,7 @@ class TwitterCorpusReader(CorpusReader):
         tweets = []
         for jsono in fulltweets:
             try:
-                text = jsono["text"]
+                text = jsono['text']
                 if isinstance(text, bytes):
                     text = text.decode(self.encoding)
                 tweets.append(text)
@@ -133,7 +135,7 @@ class TwitterCorpusReader(CorpusReader):
         """
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
index 4bfb551..934a5b5 100644 (file)
@@ -2,6 +2,7 @@
 """
 UDHR corpus reader. It mostly deals with encodings.
 """
+from __future__ import absolute_import, unicode_literals
 
 from nltk.corpus.reader.util import find_corpus_fileids
 from nltk.corpus.reader.plaintext import PlaintextCorpusReader
@@ -10,65 +11,65 @@ from nltk.corpus.reader.plaintext import PlaintextCorpusReader
 class UdhrCorpusReader(PlaintextCorpusReader):
 
     ENCODINGS = [
-        (".*-Latin1$", "latin-1"),
-        (".*-Hebrew$", "hebrew"),
-        (".*-Arabic$", "cp1256"),
-        ("Czech_Cesky-UTF8", "cp1250"),  # yeah
-        (".*-Cyrillic$", "cyrillic"),
-        (".*-SJIS$", "SJIS"),
-        (".*-GB2312$", "GB2312"),
-        (".*-Latin2$", "ISO-8859-2"),
-        (".*-Greek$", "greek"),
-        (".*-UTF8$", "utf-8"),
-        ("Hungarian_Magyar-Unicode", "utf-16-le"),
-        ("Amahuaca", "latin1"),
-        ("Turkish_Turkce-Turkish", "latin5"),
-        ("Lithuanian_Lietuviskai-Baltic", "latin4"),
-        ("Japanese_Nihongo-EUC", "EUC-JP"),
-        ("Japanese_Nihongo-JIS", "iso2022_jp"),
-        ("Chinese_Mandarin-HZ", "hz"),
-        ("Abkhaz\-Cyrillic\+Abkh", "cp1251"),
+        ('.*-Latin1$', 'latin-1'),
+        ('.*-Hebrew$', 'hebrew'),
+        ('.*-Arabic$', 'cp1256'),
+        ('Czech_Cesky-UTF8', 'cp1250'),  # yeah
+        ('.*-Cyrillic$', 'cyrillic'),
+        ('.*-SJIS$', 'SJIS'),
+        ('.*-GB2312$', 'GB2312'),
+        ('.*-Latin2$', 'ISO-8859-2'),
+        ('.*-Greek$', 'greek'),
+        ('.*-UTF8$', 'utf-8'),
+        ('Hungarian_Magyar-Unicode', 'utf-16-le'),
+        ('Amahuaca', 'latin1'),
+        ('Turkish_Turkce-Turkish', 'latin5'),
+        ('Lithuanian_Lietuviskai-Baltic', 'latin4'),
+        ('Japanese_Nihongo-EUC', 'EUC-JP'),
+        ('Japanese_Nihongo-JIS', 'iso2022_jp'),
+        ('Chinese_Mandarin-HZ', 'hz'),
+        ('Abkhaz\-Cyrillic\+Abkh', 'cp1251'),
     ]
 
     SKIP = set(
         [
             # The following files are not fully decodable because they
             # were truncated at wrong bytes:
-            "Burmese_Myanmar-UTF8",
-            "Japanese_Nihongo-JIS",
-            "Chinese_Mandarin-HZ",
-            "Chinese_Mandarin-UTF8",
-            "Gujarati-UTF8",
-            "Hungarian_Magyar-Unicode",
-            "Lao-UTF8",
-            "Magahi-UTF8",
-            "Marathi-UTF8",
-            "Tamil-UTF8",
+            'Burmese_Myanmar-UTF8',
+            'Japanese_Nihongo-JIS',
+            'Chinese_Mandarin-HZ',
+            'Chinese_Mandarin-UTF8',
+            'Gujarati-UTF8',
+            'Hungarian_Magyar-Unicode',
+            'Lao-UTF8',
+            'Magahi-UTF8',
+            'Marathi-UTF8',
+            'Tamil-UTF8',
             # Unfortunately, encodings required for reading
             # the following files are not supported by Python:
-            "Vietnamese-VPS",
-            "Vietnamese-VIQR",
-            "Vietnamese-TCVN",
-            "Magahi-Agra",
-            "Bhojpuri-Agra",
-            "Esperanto-T61",  # latin3 raises an exception
+            'Vietnamese-VPS',
+            'Vietnamese-VIQR',
+            'Vietnamese-TCVN',
+            'Magahi-Agra',
+            'Bhojpuri-Agra',
+            'Esperanto-T61',  # latin3 raises an exception
             # The following files are encoded for specific fonts:
-            "Burmese_Myanmar-WinResearcher",
-            "Armenian-DallakHelv",
-            "Tigrinya_Tigrigna-VG2Main",
-            "Amharic-Afenegus6..60375",  # ?
-            "Navaho_Dine-Navajo-Navaho-font",
+            'Burmese_Myanmar-WinResearcher',
+            'Armenian-DallakHelv',
+            'Tigrinya_Tigrigna-VG2Main',
+            'Amharic-Afenegus6..60375',  # ?
+            'Navaho_Dine-Navajo-Navaho-font',
             # What are these?
-            "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
-            "Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
+            'Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117',
+            'Azeri_Azerbaijani_Latin-Az.Times.Lat0117',
             # The following files are unintended:
-            "Czech-Latin2-err",
-            "Russian_Russky-UTF8~",
+            'Czech-Latin2-err',
+            'Russian_Russky-UTF8~',
         ]
     )
 
-    def __init__(self, root="udhr"):
-        fileids = find_corpus_fileids(root, r"(?!README|\.).*")
+    def __init__(self, root='udhr'):
+        fileids = find_corpus_fileids(root, r'(?!README|\.).*')
         super(UdhrCorpusReader, self).__init__(
             root,
             [fileid for fileid in fileids if fileid not in self.SKIP],
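A hedged sketch of how a (regexp, encoding) table like ENCODINGS above is used: the first pattern that matches a fileid selects the codec the file is opened with, which is why the mislabelled 'Czech_Cesky-UTF8' entry is listed before the generic '.*-UTF8$' rule. The three-entry table and fileids below are a small illustrative subset only.

    import re

    ENCODINGS = [
        ('.*-Latin1$', 'latin-1'),
        ('.*-UTF8$', 'utf-8'),
        ('Japanese_Nihongo-EUC', 'EUC-JP'),
    ]


    def encoding_for(fileid):
        for pattern, codec in ENCODINGS:
            if re.match(pattern, fileid):
                return codec
        return None  # no rule matched: fall back to the reader's default encoding


    print(encoding_for('English-Latin1'))        # latin-1
    print(encoding_for('Japanese_Nihongo-EUC'))  # EUC-JP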
index b85c33b..b60f7ab 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Corpus Reader Utilities
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -10,9 +10,19 @@ import os
 import bisect
 import re
 import tempfile
-import pickle
 from functools import reduce
-from xml.etree import ElementTree
+
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
+try:  # Use the c version of ElementTree, which is faster, if possible.
+    from xml.etree import cElementTree as ElementTree
+except ImportError:
+    from xml.etree import ElementTree
+
+from six import string_types, text_type
 
 from nltk.tokenize import wordpunct_tokenize
 from nltk.internals import slice_bounds
@@ -118,7 +128,7 @@ class StreamBackedCorpusView(AbstractLazySequence):
        block; and tokens is a list of the tokens in the block.
     """
 
-    def __init__(self, fileid, block_reader=None, startpos=0, encoding="utf8"):
+    def __init__(self, fileid, block_reader=None, startpos=0, encoding='utf8'):
         """
         Create a new corpus view, based on the file ``fileid``, and
         read with ``block_reader``.  See the class documentation
@@ -170,7 +180,7 @@ class StreamBackedCorpusView(AbstractLazySequence):
             else:
                 self._eofpos = os.stat(self._fileid).st_size
         except Exception as exc:
-            raise ValueError("Unable to open or access %r -- %s" % (fileid, exc))
+            raise ValueError('Unable to open or access %r -- %s' % (fileid, exc))
 
         # Maintain a cache of the most recently read block, to
         # increase efficiency of random access.
@@ -193,7 +203,7 @@ class StreamBackedCorpusView(AbstractLazySequence):
         :param stream: an input stream
         :type stream: stream
         """
-        raise NotImplementedError("Abstract Method")
+        raise NotImplementedError('Abstract Method')
 
     def _open(self):
         """
@@ -205,10 +215,10 @@ class StreamBackedCorpusView(AbstractLazySequence):
             self._stream = self._fileid.open(self._encoding)
         elif self._encoding:
             self._stream = SeekableUnicodeStreamReader(
-                open(self._fileid, "rb"), self._encoding
+                open(self._fileid, 'rb'), self._encoding
             )
         else:
-            self._stream = open(self._fileid, "rb")
+            self._stream = open(self._fileid, 'rb')
 
     def close(self):
         """
@@ -245,7 +255,7 @@ class StreamBackedCorpusView(AbstractLazySequence):
             if i < 0:
                 i += len(self)
             if i < 0:
-                raise IndexError("index out of range")
+                raise IndexError('index out of range')
             # Check if it's in the cache.
             offset = self._cache[0]
             if offset <= i < self._cache[1]:
@@ -254,7 +264,7 @@ class StreamBackedCorpusView(AbstractLazySequence):
             try:
                 return next(self.iterate_from(i))
             except StopIteration:
-                raise IndexError("index out of range")
+                raise IndexError('index out of range')
 
     # If we wanted to be thread-safe, then this method would need to
     # do some locking.
@@ -295,13 +305,13 @@ class StreamBackedCorpusView(AbstractLazySequence):
             self._current_blocknum = block_index
             tokens = self.read_block(self._stream)
             assert isinstance(tokens, (tuple, list, AbstractLazySequence)), (
-                "block reader %s() should return list or tuple."
+                'block reader %s() should return list or tuple.'
                 % self.read_block.__name__
             )
             num_toks = len(tokens)
             new_filepos = self._stream.tell()
             assert new_filepos > filepos, (
-                "block reader %s() should consume at least 1 byte (filepos=%d)"
+                'block reader %s() should consume at least 1 byte (filepos=%d)'
                 % (self.read_block.__name__, filepos)
             )
 
@@ -320,10 +330,10 @@ class StreamBackedCorpusView(AbstractLazySequence):
                     # Check for consistency:
                     assert (
                         new_filepos == self._filepos[block_index]
-                    ), "inconsistent block reader (num chars read)"
+                    ), 'inconsistent block reader (num chars read)'
                     assert (
                         toknum + num_toks == self._toknum[block_index]
-                    ), "inconsistent block reader (num tokens returned)"
+                    ), 'inconsistent block reader (num tokens returned)'
 
             # If we reached the end of the file, then update self._len
             if new_filepos == self._eofpos:
@@ -430,13 +440,13 @@ def concat(docs):
     if len(docs) == 1:
         return docs[0]
     if len(docs) == 0:
-        raise ValueError("concat() expects at least one object!")
+        raise ValueError('concat() expects at least one object!')
 
     types = set(d.__class__ for d in docs)
 
     # If they're all strings, use string concatenation.
-    if all(isinstance(doc, str) for doc in docs):
-        return "".join(docs)
+    if all(isinstance(doc, string_types) for doc in docs):
+        return ''.join(docs)
 
     # If they're all corpus views, then use ConcatenatedCorpusView.
     for typ in types:
@@ -463,7 +473,7 @@ def concat(docs):
             return reduce((lambda a, b: a + b), docs, ())
 
         if ElementTree.iselement(typ):
-            xmltree = ElementTree.Element("documents")
+            xmltree = ElementTree.Element('documents')
             for doc in docs:
                 xmltree.append(doc)
             return xmltree
@@ -524,7 +534,7 @@ class PickleCorpusView(StreamBackedCorpusView):
         fileid.  (This method is called whenever a
         ``PickledCorpusView`` is garbage-collected.
         """
-        if getattr(self, "_delete_on_gc"):
+        if getattr(self, '_delete_on_gc'):
             if os.path.exists(self._fileid):
                 try:
                     os.remove(self._fileid)
@@ -534,8 +544,8 @@ class PickleCorpusView(StreamBackedCorpusView):
 
     @classmethod
     def write(cls, sequence, output_file):
-        if isinstance(output_file, str):
-            output_file = open(output_file, "wb")
+        if isinstance(output_file, string_types):
+            output_file = open(output_file, 'wb')
         for item in sequence:
             pickle.dump(item, output_file, cls.PROTOCOL)
 
@@ -550,13 +560,13 @@ class PickleCorpusView(StreamBackedCorpusView):
             deleted whenever this object gets garbage-collected.
         """
         try:
-            fd, output_file_name = tempfile.mkstemp(".pcv", "nltk-")
-            output_file = os.fdopen(fd, "wb")
+            fd, output_file_name = tempfile.mkstemp('.pcv', 'nltk-')
+            output_file = os.fdopen(fd, 'wb')
             cls.write(sequence, output_file)
             output_file.close()
             return PickleCorpusView(output_file_name, delete_on_gc)
         except (OSError, IOError) as e:
-            raise ValueError("Error while creating temp file: %s" % e)
+            raise ValueError('Error while creating temp file: %s' % e)
 
 
 ######################################################################
@@ -584,12 +594,12 @@ def read_line_block(stream):
         line = stream.readline()
         if not line:
             return toks
-        toks.append(line.rstrip("\n"))
+        toks.append(line.rstrip('\n'))
     return toks
 
 
 def read_blankline_block(stream):
-    s = ""
+    s = ''
     while True:
         line = stream.readline()
         # End of file:
@@ -608,10 +618,10 @@ def read_blankline_block(stream):
 
 
 def read_alignedsent_block(stream):
-    s = ""
+    s = ''
     while True:
         line = stream.readline()
-        if line[0] == "=" or line[0] == "\n" or line[:2] == "\r\n":
+        if line[0] == '=' or line[0] == '\n' or line[:2] == '\r\n':
             continue
         # End of file:
         if not line:
@@ -622,7 +632,7 @@ def read_alignedsent_block(stream):
         # Other line:
         else:
             s += line
-            if re.match("^\d+-\d+", line) is not None:
+            if re.match('^\d+-\d+', line) is not None:
                 return [s]
 
 
@@ -648,15 +658,15 @@ def read_regexp_block(stream, start_re, end_re=None):
         line = stream.readline()
         # End of file:
         if not line:
-            return ["".join(lines)]
+            return [''.join(lines)]
         # End of token:
         if end_re is not None and re.match(end_re, line):
-            return ["".join(lines)]
+            return [''.join(lines)]
         # Start of new token: backup to just before it starts, and
         # return the token we've already collected.
         if end_re is None and re.match(start_re, line):
             stream.seek(oldpos)
-            return ["".join(lines)]
+            return [''.join(lines)]
         # Anything else is part of the token.
         lines.append(line)
 
@@ -682,20 +692,20 @@ def read_sexpr_block(stream, block_size=16384, comment_char=None):
     """
     start = stream.tell()
     block = stream.read(block_size)
-    encoding = getattr(stream, "encoding", None)
-    assert encoding is not None or isinstance(block, str)
-    if encoding not in (None, "utf-8"):
+    encoding = getattr(stream, 'encoding', None)
+    assert encoding is not None or isinstance(block, text_type)
+    if encoding not in (None, 'utf-8'):
         import warnings
 
         warnings.warn(
-            "Parsing may fail, depending on the properties "
-            "of the %s encoding!" % encoding
+            'Parsing may fail, depending on the properties '
+            'of the %s encoding!' % encoding
         )
         # (e.g., the utf-16 encoding does not work because it insists
         # on adding BOMs to the beginning of encoded strings.)
 
     if comment_char:
-        COMMENT = re.compile("(?m)^%s.*$" % re.escape(comment_char))
+        COMMENT = re.compile('(?m)^%s.*$' % re.escape(comment_char))
     while True:
         try:
             # If we're stripping comments, then make sure our block ends
@@ -708,7 +718,7 @@ def read_sexpr_block(stream, block_size=16384, comment_char=None):
             # Read the block.
             tokens, offset = _parse_sexpr_block(block)
             # Skip whitespace
-            offset = re.compile(r"\s*").search(block, offset).end()
+            offset = re.compile(r'\s*').search(block, offset).end()
 
             # Move to the end position.
             if encoding is None:
@@ -719,7 +729,7 @@ def read_sexpr_block(stream, block_size=16384, comment_char=None):
             # Return the list of tokens we processed
             return tokens
         except ValueError as e:
-            if e.args[0] == "Block too small":
+            if e.args[0] == 'Block too small':
                 next_block = stream.read(block_size)
                 if next_block:
                     block += next_block
@@ -734,7 +744,7 @@ def read_sexpr_block(stream, block_size=16384, comment_char=None):
 def _sub_space(m):
     """Helper function: given a regexp match, return a string of
     spaces that's the same length as the matched string."""
-    return " " * (m.end() - m.start())
+    return ' ' * (m.end() - m.start())
 
 
 def _parse_sexpr_block(block):
@@ -742,27 +752,27 @@ def _parse_sexpr_block(block):
     start = end = 0
 
     while end < len(block):
-        m = re.compile(r"\S").search(block, end)
+        m = re.compile(r'\S').search(block, end)
         if not m:
             return tokens, end
 
         start = m.start()
 
         # Case 1: sexpr is not parenthesized.
-        if m.group() != "(":
-            m2 = re.compile(r"[\s(]").search(block, start)
+        if m.group() != '(':
+            m2 = re.compile(r'[\s(]').search(block, start)
             if m2:
                 end = m2.start()
             else:
                 if tokens:
                     return tokens, end
-                raise ValueError("Block too small")
+                raise ValueError('Block too small')
 
         # Case 2: parenthesized sexpr.
         else:
             nesting = 0
-            for m in re.compile(r"[()]").finditer(block, start):
-                if m.group() == "(":
+            for m in re.compile(r'[()]').finditer(block, start):
+                if m.group() == '(':
                     nesting += 1
                 else:
                     nesting -= 1
@@ -772,7 +782,7 @@ def _parse_sexpr_block(block):
             else:
                 if tokens:
                     return tokens, end
-                raise ValueError("Block too small")
+                raise ValueError('Block too small')
 
         tokens.append(block[start:end])
 
@@ -786,8 +796,8 @@ def _parse_sexpr_block(block):
 
 def find_corpus_fileids(root, regexp):
     if not isinstance(root, PathPointer):
-        raise TypeError("find_corpus_fileids: expected a PathPointer")
-    regexp += "$"
+        raise TypeError('find_corpus_fileids: expected a PathPointer')
+    regexp += '$'
 
     # Find fileids in a zipfile: scan the zipfile's namelist.  Filter
     # out entries that end in '/' -- they're directories.
@@ -795,7 +805,7 @@ def find_corpus_fileids(root, regexp):
         fileids = [
             name[len(root.entry) :]
             for name in root.zipfile.namelist()
-            if not name.endswith("/")
+            if not name.endswith('/')
         ]
         items = [name for name in fileids if re.match(regexp, name)]
         return sorted(items)
@@ -807,17 +817,17 @@ def find_corpus_fileids(root, regexp):
         # workaround for py25 which doesn't support followlinks
         kwargs = {}
         if not py25():
-            kwargs = {"followlinks": True}
+            kwargs = {'followlinks': True}
         for dirname, subdirs, fileids in os.walk(root.path, **kwargs):
-            prefix = "".join("%s/" % p for p in _path_from(root.path, dirname))
+            prefix = ''.join('%s/' % p for p in _path_from(root.path, dirname))
             items += [
                 prefix + fileid
                 for fileid in fileids
                 if re.match(regexp, prefix + fileid)
             ]
             # Don't visit svn directories:
-            if ".svn" in subdirs:
-                subdirs.remove(".svn")
+            if '.svn' in subdirs:
+                subdirs.remove('.svn')
         return sorted(items)
 
     else:
@@ -825,7 +835,7 @@ def find_corpus_fileids(root, regexp):
 
 
 def _path_from(parent, child):
-    if os.path.split(parent)[1] == "":
+    if os.path.split(parent)[1] == '':
         parent = os.path.split(parent)[0]
     path = []
     while parent != child:
@@ -842,15 +852,15 @@ def _path_from(parent, child):
 
 def tagged_treebank_para_block_reader(stream):
     # Read the next paragraph.
-    para = ""
+    para = ''
     while True:
         line = stream.readline()
         # End of paragraph:
-        if re.match("======+\s*$", line):
+        if re.match('======+\s*$', line):
             if para.strip():
                 return [para]
         # End of file:
-        elif line == "":
+        elif line == '':
             if para.strip():
                 return [para]
             else:
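The guarded imports restored at the top of this file follow the usual Python 2/3 fallback idiom; a minimal standalone sketch (the pickled data and XML snippet are made up for illustration):

    try:                   # Python 2: prefer the faster C pickle implementation
        import cPickle as pickle
    except ImportError:    # Python 3: cPickle no longer exists
        import pickle

    try:                   # use the C ElementTree when the alias is available
        from xml.etree import cElementTree as ElementTree
    except ImportError:
        from xml.etree import ElementTree

    blob = pickle.dumps({'marker': 'lx'})
    root = ElementTree.fromstring('<documents/>')
    print(pickle.loads(blob), root.tag)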
index 0ab5f59..d0492f5 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Verbnet Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -11,11 +11,14 @@ An NLTK interface to the VerbNet verb lexicon
 For details about VerbNet see:
 https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
 """
+from __future__ import unicode_literals
 
 import re
 import textwrap
 from collections import defaultdict
 
+from six import string_types
+
 from nltk.corpus.reader.xmldocs import XMLCorpusReader
 
 
@@ -57,10 +60,10 @@ class VerbnetCorpusReader(XMLCorpusReader):
         # runs 2-30 times faster.
         self._quick_index()
 
-    _LONGID_RE = re.compile(r"([^\-\.]*)-([\d+.\-]+)$")
+    _LONGID_RE = re.compile(r'([^\-\.]*)-([\d+.\-]+)$')
     """Regular expression that matches (and decomposes) longids"""
 
-    _SHORTID_RE = re.compile(r"[\d+.\-]+$")
+    _SHORTID_RE = re.compile(r'[\d+.\-]+$')
     """Regular expression that matches shortids"""
 
     _INDEX_RE = re.compile(
@@ -78,9 +81,9 @@ class VerbnetCorpusReader(XMLCorpusReader):
             return sorted(self._lemma_to_class.keys())
         else:
             # [xx] should this include subclass members?
-            if isinstance(vnclass, str):
+            if isinstance(vnclass, string_types):
                 vnclass = self.vnclass(vnclass)
-            return [member.get("name") for member in vnclass.findall("MEMBERS/MEMBER")]
+            return [member.get('name') for member in vnclass.findall('MEMBERS/MEMBER')]
 
     def wordnetids(self, vnclass=None):
         """
@@ -91,12 +94,12 @@ class VerbnetCorpusReader(XMLCorpusReader):
             return sorted(self._wordnet_to_class.keys())
         else:
             # [xx] should this include subclass members?
-            if isinstance(vnclass, str):
+            if isinstance(vnclass, string_types):
                 vnclass = self.vnclass(vnclass)
             return sum(
                 [
-                    member.get("wn", "").split()
-                    for member in vnclass.findall("MEMBERS/MEMBER")
+                    member.get('wn', '').split()
+                    for member in vnclass.findall('MEMBERS/MEMBER')
                 ],
                 [],
             )
@@ -123,8 +126,8 @@ class VerbnetCorpusReader(XMLCorpusReader):
         elif classid is not None:
             xmltree = self.vnclass(classid)
             return [
-                subclass.get("ID")
-                for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS")
+                subclass.get('ID')
+                for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS')
             ]
         else:
             return sorted(self._class_to_fileid.keys())
@@ -150,17 +153,17 @@ class VerbnetCorpusReader(XMLCorpusReader):
         if classid in self._class_to_fileid:
             fileid = self._class_to_fileid[self.longid(classid)]
             tree = self.xml(fileid)
-            if classid == tree.get("ID"):
+            if classid == tree.get('ID'):
                 return tree
             else:
-                for subclass in tree.findall(".//VNSUBCLASS"):
-                    if classid == subclass.get("ID"):
+                for subclass in tree.findall('.//VNSUBCLASS'):
+                    if classid == subclass.get('ID'):
                         return subclass
                 else:
                     assert False  # we saw it during _index()!
 
         else:
-            raise ValueError("Unknown identifier {}".format(fileid_or_classid))
+            raise ValueError('Unknown identifier {}'.format(fileid_or_classid))
 
     def fileids(self, vnclass_ids=None):
         """
@@ -170,7 +173,7 @@ class VerbnetCorpusReader(XMLCorpusReader):
         """
         if vnclass_ids is None:
             return self._fileids
-        elif isinstance(vnclass_ids, str):
+        elif isinstance(vnclass_ids, string_types):
             return [self._class_to_fileid[self.longid(vnclass_ids)]]
         else:
             return [
@@ -191,17 +194,17 @@ class VerbnetCorpusReader(XMLCorpusReader):
             containing the xml contents of a VerbNet class.
         :return: frames - a list of frame dictionaries
         """
-        if isinstance(vnclass, str):
+        if isinstance(vnclass, string_types):
             vnclass = self.vnclass(vnclass)
         frames = []
-        vnframes = vnclass.findall("FRAMES/FRAME")
+        vnframes = vnclass.findall('FRAMES/FRAME')
         for vnframe in vnframes:
             frames.append(
                 {
-                    "example": self._get_example_within_frame(vnframe),
-                    "description": self._get_description_within_frame(vnframe),
-                    "syntax": self._get_syntactic_list_within_frame(vnframe),
-                    "semantics": self._get_semantics_within_frame(vnframe),
+                    'example': self._get_example_within_frame(vnframe),
+                    'description': self._get_description_within_frame(vnframe),
+                    'syntax': self._get_syntactic_list_within_frame(vnframe),
+                    'semantics': self._get_semantics_within_frame(vnframe),
                 }
             )
         return frames
@@ -216,11 +219,11 @@ class VerbnetCorpusReader(XMLCorpusReader):
             containing the xml contents of a VerbNet class.
         :return: list of subclasses
         """
-        if isinstance(vnclass, str):
+        if isinstance(vnclass, string_types):
             vnclass = self.vnclass(vnclass)
 
         subclasses = [
-            subclass.get("ID") for subclass in vnclass.findall("SUBCLASSES/VNSUBCLASS")
+            subclass.get('ID') for subclass in vnclass.findall('SUBCLASSES/VNSUBCLASS')
         ]
         return subclasses
 
@@ -235,17 +238,17 @@ class VerbnetCorpusReader(XMLCorpusReader):
             containing the xml contents of a VerbNet class.
         :return: themroles: A list of thematic roles in the VerbNet class
         """
-        if isinstance(vnclass, str):
+        if isinstance(vnclass, string_types):
             vnclass = self.vnclass(vnclass)
 
         themroles = []
-        for trole in vnclass.findall("THEMROLES/THEMROLE"):
+        for trole in vnclass.findall('THEMROLES/THEMROLE'):
             themroles.append(
                 {
-                    "type": trole.get("type"),
-                    "modifiers": [
-                        {"value": restr.get("Value"), "type": restr.get("type")}
-                        for restr in trole.findall("SELRESTRS/SELRESTR")
+                    'type': trole.get('type'),
+                    'modifiers': [
+                        {'value': restr.get('Value'), 'type': restr.get('type')}
+                        for restr in trole.findall('SELRESTRS/SELRESTR')
                     ],
                 }
             )
@@ -259,23 +262,23 @@ class VerbnetCorpusReader(XMLCorpusReader):
         """
         Initialize the indexes ``_lemma_to_class``,
         ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
-        through the corpus fileids.  This is fast if ElementTree
-        uses the C implementation (<0.1 secs), but quite slow (>10 secs)
-        if only the python implementation is available.
+        through the corpus fileids.  This is fast with cElementTree
+        (<0.1 secs), but quite slow (>10 secs) with the python
+        implementation of ElementTree.
         """
         for fileid in self._fileids:
             self._index_helper(self.xml(fileid), fileid)
 
     def _index_helper(self, xmltree, fileid):
         """Helper for ``_index()``"""
-        vnclass = xmltree.get("ID")
+        vnclass = xmltree.get('ID')
         self._class_to_fileid[vnclass] = fileid
         self._shortid_to_longid[self.shortid(vnclass)] = vnclass
-        for member in xmltree.findall("MEMBERS/MEMBER"):
-            self._lemma_to_class[member.get("name")].append(vnclass)
-            for wn in member.get("wn", "").split():
+        for member in xmltree.findall('MEMBERS/MEMBER'):
+            self._lemma_to_class[member.get('name')].append(vnclass)
+            for wn in member.get('wn', '').split():
                 self._wordnet_to_class[wn].append(vnclass)
-        for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS"):
+        for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS'):
             self._index_helper(subclass, fileid)
 
     def _quick_index(self):
@@ -285,8 +288,8 @@ class VerbnetCorpusReader(XMLCorpusReader):
         through the corpus fileids.  This doesn't do proper xml parsing,
         but is good enough to find everything in the standard VerbNet
         corpus -- and it runs about 30 times faster than xml parsing
-        (with the python ElementTree; only 2-3 times faster
-        if ElementTree uses the C implementation).
+        (with the python ElementTree; only 2-3 times faster with
+        cElementTree).
         """
         # nb: if we got rid of wordnet_to_class, this would run 2-3
         # times faster.
@@ -305,7 +308,7 @@ class VerbnetCorpusReader(XMLCorpusReader):
                     vnclass = groups[2]  # for <MEMBER> elts.
                     self._shortid_to_longid[self.shortid(vnclass)] = vnclass
                 else:
-                    assert False, "unexpected match condition"
+                    assert False, 'unexpected match condition'
 
     ######################################################################
     # { Identifier conversion
@@ -320,11 +323,11 @@ class VerbnetCorpusReader(XMLCorpusReader):
         if self._LONGID_RE.match(shortid):
             return shortid  # it's already a longid.
         elif not self._SHORTID_RE.match(shortid):
-            raise ValueError("vnclass identifier %r not found" % shortid)
+            raise ValueError('vnclass identifier %r not found' % shortid)
         try:
             return self._shortid_to_longid[shortid]
         except KeyError:
-            raise ValueError("vnclass identifier %r not found" % shortid)
+            raise ValueError('vnclass identifier %r not found' % shortid)
 
     def shortid(self, longid):
         """Returns shortid of a VerbNet class
@@ -338,7 +341,7 @@ class VerbnetCorpusReader(XMLCorpusReader):
         if m:
             return m.group(2)
         else:
-            raise ValueError("vnclass identifier %r not found" % longid)
+            raise ValueError('vnclass identifier %r not found' % longid)
 
     ######################################################################
     # { Frame access utility functions
@@ -357,13 +360,13 @@ class VerbnetCorpusReader(XMLCorpusReader):
         :return: semantics: semantics dictionary
         """
         semantics_within_single_frame = []
-        for pred in vnframe.findall("SEMANTICS/PRED"):
+        for pred in vnframe.findall('SEMANTICS/PRED'):
             arguments = [
-                {"type": arg.get("type"), "value": arg.get("value")}
-                for arg in pred.findall("ARGS/ARG")
+                {'type': arg.get('type'), 'value': arg.get('value')}
+                for arg in pred.findall('ARGS/ARG')
             ]
             semantics_within_single_frame.append(
-                {"predicate_value": pred.get("value"), "arguments": arguments}
+                {'predicate_value': pred.get('value'), 'arguments': arguments}
             )
         return semantics_within_single_frame
 
@@ -376,7 +379,7 @@ class VerbnetCorpusReader(XMLCorpusReader):
             a VerbNet frame.
         :return: example_text: The example sentence for this particular frame
         """
-        example_element = vnframe.find("EXAMPLES/EXAMPLE")
+        example_element = vnframe.find('EXAMPLES/EXAMPLE')
         if example_element is not None:
             example_text = example_element.text
         else:
@@ -393,10 +396,10 @@ class VerbnetCorpusReader(XMLCorpusReader):
             a VerbNet frame.
         :return: description: a description dictionary with members - primary and secondary
         """
-        description_element = vnframe.find("DESCRIPTION")
+        description_element = vnframe.find('DESCRIPTION')
         return {
-            "primary": description_element.attrib["primary"],
-            "secondary": description_element.get("secondary", ""),
+            'primary': description_element.attrib['primary'],
+            'secondary': description_element.get('secondary', ''),
         }
 
     def _get_syntactic_list_within_frame(self, vnframe):
@@ -412,20 +415,20 @@ class VerbnetCorpusReader(XMLCorpusReader):
         :return: syntax_within_single_frame
         """
         syntax_within_single_frame = []
-        for elt in vnframe.find("SYNTAX"):
+        for elt in vnframe.find('SYNTAX'):
             pos_tag = elt.tag
             modifiers = dict()
-            modifiers["value"] = elt.get("value") if "value" in elt.attrib else ""
-            modifiers["selrestrs"] = [
-                {"value": restr.get("Value"), "type": restr.get("type")}
-                for restr in elt.findall("SELRESTRS/SELRESTR")
+            modifiers['value'] = elt.get('value') if 'value' in elt.attrib else ""
+            modifiers['selrestrs'] = [
+                {'value': restr.get('Value'), 'type': restr.get('type')}
+                for restr in elt.findall('SELRESTRS/SELRESTR')
             ]
-            modifiers["synrestrs"] = [
-                {"value": restr.get("Value"), "type": restr.get("type")}
-                for restr in elt.findall("SYNRESTRS/SYNRESTR")
+            modifiers['synrestrs'] = [
+                {'value': restr.get('Value'), 'type': restr.get('type')}
+                for restr in elt.findall('SYNRESTRS/SYNRESTR')
             ]
             syntax_within_single_frame.append(
-                {"pos_tag": pos_tag, "modifiers": modifiers}
+                {'pos_tag': pos_tag, 'modifiers': modifiers}
             )
         return syntax_within_single_frame
 
@@ -442,19 +445,19 @@ class VerbnetCorpusReader(XMLCorpusReader):
         :param vnclass: A VerbNet class identifier; or an ElementTree
         containing the xml contents of a VerbNet class.
         """
-        if isinstance(vnclass, str):
+        if isinstance(vnclass, string_types):
             vnclass = self.vnclass(vnclass)
 
-        s = vnclass.get("ID") + "\n"
-        s += self.pprint_subclasses(vnclass, indent="  ") + "\n"
-        s += self.pprint_members(vnclass, indent="  ") + "\n"
-        s += "  Thematic roles:\n"
-        s += self.pprint_themroles(vnclass, indent="    ") + "\n"
-        s += "  Frames:\n"
-        s += self.pprint_frames(vnclass, indent="    ")
+        s = vnclass.get('ID') + '\n'
+        s += self.pprint_subclasses(vnclass, indent='  ') + '\n'
+        s += self.pprint_members(vnclass, indent='  ') + '\n'
+        s += '  Thematic roles:\n'
+        s += self.pprint_themroles(vnclass, indent='    ') + '\n'
+        s += '  Frames:\n'
+        s += self.pprint_frames(vnclass, indent='    ')
         return s
 
-    def pprint_subclasses(self, vnclass, indent=""):
+    def pprint_subclasses(self, vnclass, indent=''):
         """Returns pretty printed version of subclasses of VerbNet class
 
         Return a string containing a pretty-printed representation of
@@ -463,18 +466,18 @@ class VerbnetCorpusReader(XMLCorpusReader):
         :param vnclass: A VerbNet class identifier; or an ElementTree
             containing the xml contents of a VerbNet class.
         """
-        if isinstance(vnclass, str):
+        if isinstance(vnclass, string_types):
             vnclass = self.vnclass(vnclass)
 
         subclasses = self.subclasses(vnclass)
         if not subclasses:
-            subclasses = ["(none)"]
-        s = "Subclasses: " + " ".join(subclasses)
+            subclasses = ['(none)']
+        s = 'Subclasses: ' + ' '.join(subclasses)
         return textwrap.fill(
-            s, 70, initial_indent=indent, subsequent_indent=indent + "  "
+            s, 70, initial_indent=indent, subsequent_indent=indent + '  '
         )
 
-    def pprint_members(self, vnclass, indent=""):
+    def pprint_members(self, vnclass, indent=''):
         """Returns pretty printed version of members in a VerbNet class
 
         Return a string containing a pretty-printed representation of
@@ -483,18 +486,18 @@ class VerbnetCorpusReader(XMLCorpusReader):
         :param vnclass: A VerbNet class identifier; or an ElementTree
             containing the xml contents of a VerbNet class.
         """
-        if isinstance(vnclass, str):
+        if isinstance(vnclass, string_types):
             vnclass = self.vnclass(vnclass)
 
         members = self.lemmas(vnclass)
         if not members:
-            members = ["(none)"]
-        s = "Members: " + " ".join(members)
+            members = ['(none)']
+        s = 'Members: ' + ' '.join(members)
         return textwrap.fill(
-            s, 70, initial_indent=indent, subsequent_indent=indent + "  "
+            s, 70, initial_indent=indent, subsequent_indent=indent + '  '
         )
 
-    def pprint_themroles(self, vnclass, indent=""):
+    def pprint_themroles(self, vnclass, indent=''):
         """Returns pretty printed version of thematic roles in a VerbNet class
 
         Return a string containing a pretty-printed representation of
@@ -503,22 +506,22 @@ class VerbnetCorpusReader(XMLCorpusReader):
         :param vnclass: A VerbNet class identifier; or an ElementTree
             containing the xml contents of a VerbNet class.
         """
-        if isinstance(vnclass, str):
+        if isinstance(vnclass, string_types):
             vnclass = self.vnclass(vnclass)
 
         pieces = []
         for themrole in self.themroles(vnclass):
-            piece = indent + "* " + themrole.get("type")
+            piece = indent + '* ' + themrole.get('type')
             modifiers = [
-                modifier["value"] + modifier["type"]
-                for modifier in themrole["modifiers"]
+                modifier['value'] + modifier['type']
+                for modifier in themrole['modifiers']
             ]
             if modifiers:
-                piece += "[{}]".format(" ".join(modifiers))
+                piece += '[{}]'.format(' '.join(modifiers))
             pieces.append(piece)
-        return "\n".join(pieces)
+        return '\n'.join(pieces)
 
-    def pprint_frames(self, vnclass, indent=""):
+    def pprint_frames(self, vnclass, indent=''):
         """Returns pretty version of all frames in a VerbNet class
 
         Return a string containing a pretty-printed representation of
@@ -527,14 +530,14 @@ class VerbnetCorpusReader(XMLCorpusReader):
         :param vnclass: A VerbNet class identifier; or an ElementTree
             containing the xml contents of a VerbNet class.
         """
-        if isinstance(vnclass, str):
+        if isinstance(vnclass, string_types):
             vnclass = self.vnclass(vnclass)
         pieces = []
         for vnframe in self.frames(vnclass):
             pieces.append(self._pprint_single_frame(vnframe, indent))
-        return "\n".join(pieces)
+        return '\n'.join(pieces)
 
-    def _pprint_single_frame(self, vnframe, indent=""):
+    def _pprint_single_frame(self, vnframe, indent=''):
         """Returns pretty printed version of a single frame in a VerbNet class
 
         Returns a string containing a pretty-printed representation of
@@ -543,16 +546,16 @@ class VerbnetCorpusReader(XMLCorpusReader):
         :param vnframe: An ElementTree containing the xml contents of
             a VerbNet frame.
         """
-        frame_string = self._pprint_description_within_frame(vnframe, indent) + "\n"
-        frame_string += self._pprint_example_within_frame(vnframe, indent + " ") + "\n"
+        frame_string = self._pprint_description_within_frame(vnframe, indent) + '\n'
+        frame_string += self._pprint_example_within_frame(vnframe, indent + ' ') + '\n'
         frame_string += (
-            self._pprint_syntax_within_frame(vnframe, indent + "  Syntax: ") + "\n"
+            self._pprint_syntax_within_frame(vnframe, indent + '  Syntax: ') + '\n'
         )
-        frame_string += indent + "  Semantics:\n"
-        frame_string += self._pprint_semantics_within_frame(vnframe, indent + "    ")
+        frame_string += indent + '  Semantics:\n'
+        frame_string += self._pprint_semantics_within_frame(vnframe, indent + '    ')
         return frame_string
 
-    def _pprint_example_within_frame(self, vnframe, indent=""):
+    def _pprint_example_within_frame(self, vnframe, indent=''):
         """Returns pretty printed version of example within frame in a VerbNet class
 
         Return a string containing a pretty-printed representation of
@@ -561,10 +564,10 @@ class VerbnetCorpusReader(XMLCorpusReader):
         :param vnframe: An ElementTree containing the xml contents of
             a Verbnet frame.
         """
-        if vnframe["example"]:
-            return indent + " Example: " + vnframe["example"]
+        if vnframe['example']:
+            return indent + ' Example: ' + vnframe['example']
 
-    def _pprint_description_within_frame(self, vnframe, indent=""):
+    def _pprint_description_within_frame(self, vnframe, indent=''):
         """Returns pretty printed version of a VerbNet frame description
 
         Return a string containing a pretty-printed representation of
@@ -573,12 +576,12 @@ class VerbnetCorpusReader(XMLCorpusReader):
         :param vnframe: An ElementTree containing the xml contents of
             a VerbNet frame.
         """
-        description = indent + vnframe["description"]["primary"]
-        if vnframe["description"]["secondary"]:
-            description += " ({})".format(vnframe["description"]["secondary"])
+        description = indent + vnframe['description']['primary']
+        if vnframe['description']['secondary']:
+            description += ' ({})'.format(vnframe['description']['secondary'])
         return description
 
-    def _pprint_syntax_within_frame(self, vnframe, indent=""):
+    def _pprint_syntax_within_frame(self, vnframe, indent=''):
         """Returns pretty printed version of syntax within a frame in a VerbNet class
 
         Return a string containing a pretty-printed representation of
@@ -588,25 +591,25 @@ class VerbnetCorpusReader(XMLCorpusReader):
             a VerbNet frame.
         """
         pieces = []
-        for element in vnframe["syntax"]:
-            piece = element["pos_tag"]
+        for element in vnframe['syntax']:
+            piece = element['pos_tag']
             modifier_list = []
-            if "value" in element["modifiers"] and element["modifiers"]["value"]:
-                modifier_list.append(element["modifiers"]["value"])
+            if 'value' in element['modifiers'] and element['modifiers']['value']:
+                modifier_list.append(element['modifiers']['value'])
             modifier_list += [
-                "{}{}".format(restr["value"], restr["type"])
+                '{}{}'.format(restr['value'], restr['type'])
                 for restr in (
-                    element["modifiers"]["selrestrs"]
-                    + element["modifiers"]["synrestrs"]
+                    element['modifiers']['selrestrs']
+                    + element['modifiers']['synrestrs']
                 )
             ]
             if modifier_list:
-                piece += "[{}]".format(" ".join(modifier_list))
+                piece += '[{}]'.format(' '.join(modifier_list))
             pieces.append(piece)
 
-        return indent + " ".join(pieces)
+        return indent + ' '.join(pieces)
 
-    def _pprint_semantics_within_frame(self, vnframe, indent=""):
+    def _pprint_semantics_within_frame(self, vnframe, indent=''):
         """Returns a pretty printed version of semantics within frame in a VerbNet class
 
         Return a string containing a pretty-printed representation of
@@ -616,9 +619,9 @@ class VerbnetCorpusReader(XMLCorpusReader):
             a VerbNet frame.
         """
         pieces = []
-        for predicate in vnframe["semantics"]:
-            arguments = [argument["value"] for argument in predicate["arguments"]]
+        for predicate in vnframe['semantics']:
+            arguments = [argument['value'] for argument in predicate['arguments']]
             pieces.append(
-                "{}({})".format(predicate["predicate_value"], ", ".join(arguments))
+                '{}({})'.format(predicate['predicate_value'], ', '.join(arguments))
             )
-        return "\n".join("{}* {}".format(indent, piece) for piece in pieces)
+        return '\n'.join('{}* {}'.format(indent, piece) for piece in pieces)
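Illustrative only: what the two identifier regexes near the top of this reader (_LONGID_RE and _SHORTID_RE) accept, using the VerbNet class id 'hit-18.1' as the example.

    import re

    _LONGID_RE = re.compile(r'([^\-\.]*)-([\d+.\-]+)$')
    _SHORTID_RE = re.compile(r'[\d+.\-]+$')

    m = _LONGID_RE.match('hit-18.1')
    print(m.group(1), m.group(2))           # 'hit' '18.1': lemma part and shortid part
    print(bool(_SHORTID_RE.match('18.1')))  # True -- already a shortid, no lookup needed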
index 0d0d214..31332d7 100644 (file)
@@ -1,11 +1,13 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Word List Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from six import string_types
+
 from nltk.tokenize import line_tokenize
 
 from nltk.corpus.reader.util import *
@@ -17,7 +19,7 @@ class WordListCorpusReader(CorpusReader):
     List of words, one per line.  Blank lines are ignored.
     """
 
-    def words(self, fileids=None, ignore_lines_startswith="\n"):
+    def words(self, fileids=None, ignore_lines_startswith='\n'):
         return [
             line
             for line in line_tokenize(self.raw(fileids))
@@ -27,7 +29,7 @@ class WordListCorpusReader(CorpusReader):
     def raw(self, fileids=None):
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
 
@@ -52,32 +54,32 @@ class NonbreakingPrefixesCorpusReader(WordListCorpusReader):
     """
 
     available_langs = {
-        "catalan": "ca",
-        "czech": "cs",
-        "german": "de",
-        "greek": "el",
-        "english": "en",
-        "spanish": "es",
-        "finnish": "fi",
-        "french": "fr",
-        "hungarian": "hu",
-        "icelandic": "is",
-        "italian": "it",
-        "latvian": "lv",
-        "dutch": "nl",
-        "polish": "pl",
-        "portuguese": "pt",
-        "romanian": "ro",
-        "russian": "ru",
-        "slovak": "sk",
-        "slovenian": "sl",
-        "swedish": "sv",
-        "tamil": "ta",
+        'catalan': 'ca',
+        'czech': 'cs',
+        'german': 'de',
+        'greek': 'el',
+        'english': 'en',
+        'spanish': 'es',
+        'finnish': 'fi',
+        'french': 'fr',
+        'hungarian': 'hu',
+        'icelandic': 'is',
+        'italian': 'it',
+        'latvian': 'lv',
+        'dutch': 'nl',
+        'polish': 'pl',
+        'portuguese': 'pt',
+        'romanian': 'ro',
+        'russian': 'ru',
+        'slovak': 'sk',
+        'slovenian': 'sl',
+        'swedish': 'sv',
+        'tamil': 'ta',
     }
     # Also, add the lang IDs as the keys.
     available_langs.update({v: v for v in available_langs.values()})
 
-    def words(self, lang=None, fileids=None, ignore_lines_startswith="#"):
+    def words(self, lang=None, fileids=None, ignore_lines_startswith='#'):
         """
         This module returns a list of nonbreaking prefixes for the specified
         language(s).
@@ -95,7 +97,7 @@ class NonbreakingPrefixesCorpusReader(WordListCorpusReader):
         # all languages when fileids==None.
         if lang in self.available_langs:
             lang = self.available_langs[lang]
-            fileids = ["nonbreaking_prefix." + lang]
+            fileids = ['nonbreaking_prefix.' + lang]
         return [
             line
             for line in line_tokenize(self.raw(fileids))
@@ -113,21 +115,21 @@ class UnicharsCorpusReader(WordListCorpusReader):
 
     # These are categories similar to the Perl Unicode Properties
     available_categories = [
-        "Close_Punctuation",
-        "Currency_Symbol",
-        "IsAlnum",
-        "IsAlpha",
-        "IsLower",
-        "IsN",
-        "IsSc",
-        "IsSo",
-        "IsUpper",
-        "Line_Separator",
-        "Number",
-        "Open_Punctuation",
-        "Punctuation",
-        "Separator",
-        "Symbol",
+        'Close_Punctuation',
+        'Currency_Symbol',
+        'IsAlnum',
+        'IsAlpha',
+        'IsLower',
+        'IsN',
+        'IsSc',
+        'IsSo',
+        'IsUpper',
+        'Line_Separator',
+        'Number',
+        'Open_Punctuation',
+        'Punctuation',
+        'Separator',
+        'Symbol',
     ]
 
     def chars(self, category=None, fileids=None):
@@ -146,7 +148,7 @@ class UnicharsCorpusReader(WordListCorpusReader):
         :return: a list of characters given the specific unicode character category
         """
         if category in self.available_categories:
-            fileids = [category + ".txt"]
+            fileids = [category + '.txt']
         return list(self.raw(fileids).strip())
 
 
@@ -165,10 +167,10 @@ class MWAPPDBCorpusReader(WordListCorpusReader):
     :return: a list of tuples of similar lexical terms.
     """
 
-    mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"
+    mwa_ppdb_xxxl_file = 'ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs'
 
     def entries(self, fileids=mwa_ppdb_xxxl_file):
         """
         :return: a tuple of synonym word pairs.
         """
-        return [tuple(line.split("\t")) for line in line_tokenize(self.raw(fileids))]
+        return [tuple(line.split('\t')) for line in line_tokenize(self.raw(fileids))]
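A small sketch, covering only the lookup logic, of how the available_langs table above maps either a language name or an ISO code onto the nonbreaking_prefix fileid that words() opens. The three-entry table is an illustrative subset.

    available_langs = {'english': 'en', 'german': 'de', 'french': 'fr'}
    # Accept the ISO codes themselves as keys too, as the reader does.
    available_langs.update({v: v for v in available_langs.values()})

    for lang in ('english', 'en', 'german'):
        print(lang, '->', 'nonbreaking_prefix.' + available_langs[lang])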
index 3ced8a4..e67664e 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: WordNet
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bethard <Steven.Bethard@colorado.edu>
 #         Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
@@ -29,6 +29,8 @@ http://compling.hss.ntu.edu.sg/omw/
 
 """
 
+from __future__ import print_function, unicode_literals
+
 import math
 import re
 from itertools import islice, chain
@@ -36,9 +38,13 @@ from functools import total_ordering
 from operator import itemgetter
 from collections import defaultdict, deque
 
+from six import iteritems
+from six.moves import range
+
 from nltk.corpus.reader import CorpusReader
 from nltk.util import binary_search_file as _binary_search_file
 from nltk.probability import FreqDist
+from nltk.compat import python_2_unicode_compatible
 from nltk.internals import deprecated
 
 ######################################################################
@@ -62,7 +68,7 @@ from nltk.internals import deprecated
 _INF = 1e300
 
 # { Part-of-speech constants
-ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
+ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
 # }
 
 POS_LIST = [NOUN, VERB, ADJ, ADV]
@@ -107,7 +113,7 @@ VERB_FRAME_STRINGS = (
     "Something %s INFINITIVE",
 )
 
-SENSENUM_RE = re.compile(r"\.[\d]+\.")
+SENSENUM_RE = re.compile(r'\.[\d]+\.')
 
 
 ######################################################################
@@ -124,76 +130,76 @@ class _WordNetObject(object):
     """A common base class for lemmas and synsets."""
 
     def hypernyms(self):
-        return self._related("@")
+        return self._related('@')
 
     def _hypernyms(self):
-        return self._related("@")
+        return self._related('@')
 
     def instance_hypernyms(self):
-        return self._related("@i")
+        return self._related('@i')
 
     def _instance_hypernyms(self):
-        return self._related("@i")
+        return self._related('@i')
 
     def hyponyms(self):
-        return self._related("~")
+        return self._related('~')
 
     def instance_hyponyms(self):
-        return self._related("~i")
+        return self._related('~i')
 
     def member_holonyms(self):
-        return self._related("#m")
+        return self._related('#m')
 
     def substance_holonyms(self):
-        return self._related("#s")
+        return self._related('#s')
 
     def part_holonyms(self):
-        return self._related("#p")
+        return self._related('#p')
 
     def member_meronyms(self):
-        return self._related("%m")
+        return self._related('%m')
 
     def substance_meronyms(self):
-        return self._related("%s")
+        return self._related('%s')
 
     def part_meronyms(self):
-        return self._related("%p")
+        return self._related('%p')
 
     def topic_domains(self):
-        return self._related(";c")
+        return self._related(';c')
 
     def in_topic_domains(self):
-        return self._related("-c")
+        return self._related('-c')
 
     def region_domains(self):
-        return self._related(";r")
+        return self._related(';r')
 
     def in_region_domains(self):
-        return self._related("-r")
+        return self._related('-r')
 
     def usage_domains(self):
-        return self._related(";u")
+        return self._related(';u')
 
     def in_usage_domains(self):
-        return self._related("-u")
+        return self._related('-u')
 
     def attributes(self):
-        return self._related("=")
+        return self._related('=')
 
     def entailments(self):
-        return self._related("*")
+        return self._related('*')
 
     def causes(self):
-        return self._related(">")
+        return self._related('>')
 
     def also_sees(self):
-        return self._related("^")
+        return self._related('^')
 
     def verb_groups(self):
-        return self._related("$")
+        return self._related('$')
 
     def similar_tos(self):
-        return self._related("&")
+        return self._related('&')
 
     def __hash__(self):
         return hash(self._name)
@@ -208,6 +214,7 @@ class _WordNetObject(object):
         return self._name < other._name
 
 
+@python_2_unicode_compatible
 class Lemma(_WordNetObject):
     """
     The lexical entry for a single morphological form of a
@@ -223,13 +230,13 @@ class Lemma(_WordNetObject):
     'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and
     'salt.n.03.salinity'.
 
-    Lemma attributes, accessible via methods with the same name:
+    Lemma attributes, accessible via methods with the same name::
 
     - name: The canonical name of this lemma.
     - synset: The synset that this lemma belongs to.
     - syntactic_marker: For adjectives, the WordNet string identifying the
       syntactic position relative to the modified noun. See:
-      https://wordnet.princeton.edu/documentation/wninput5wn
+      http://wordnet.princeton.edu/man/wninput.5WN.html#sect10
       For all other parts of speech, this attribute is None.
     - count: The frequency of this lemma in wordnet.
 
@@ -237,7 +244,7 @@ class Lemma(_WordNetObject):
 
     Lemmas have the following methods for retrieving related Lemmas. They
     correspond to the names for the pointer symbols defined here:
-    https://wordnet.princeton.edu/documentation/wninput5wn
+    http://wordnet.princeton.edu/man/wninput.5WN.html#sect3
     These methods all return lists of Lemmas:
 
     - antonyms
@@ -257,16 +264,16 @@ class Lemma(_WordNetObject):
     """
 
     __slots__ = [
-        "_wordnet_corpus_reader",
-        "_name",
-        "_syntactic_marker",
-        "_synset",
-        "_frame_strings",
-        "_frame_ids",
-        "_lexname_index",
-        "_lex_id",
-        "_lang",
-        "_key",
+        '_wordnet_corpus_reader',
+        '_name',
+        '_syntactic_marker',
+        '_synset',
+        '_frame_strings',
+        '_frame_ids',
+        '_lexname_index',
+        '_lex_id',
+        '_lang',
+        '_key',
     ]
 
     def __init__(
@@ -286,7 +293,7 @@ class Lemma(_WordNetObject):
         self._frame_ids = []
         self._lexname_index = lexname_index
         self._lex_id = lex_id
-        self._lang = "eng"
+        self._lang = 'eng'
 
         self._key = None  # gets set later.
 
@@ -331,15 +338,16 @@ class Lemma(_WordNetObject):
         return self._wordnet_corpus_reader.lemma_count(self)
 
     def antonyms(self):
-        return self._related("!")
+        return self._related('!')
 
     def derivationally_related_forms(self):
-        return self._related("+")
+        return self._related('+')
 
     def pertainyms(self):
-        return self._related("\\")
+        return self._related('\\')
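
A minimal usage sketch of the relation lookups defined above, assuming the WordNet corpus has been fetched with nltk.download('wordnet'):

    from nltk.corpus import wordnet as wn

    good = wn.lemma('good.a.01.good')
    print(good.antonyms())                                    # e.g. [Lemma('bad.a.01.bad')]
    print(wn.lemma('vocal.a.01.vocal').pertainyms())          # e.g. [Lemma('voice.n.02.voice')]
    print(wn.lemma('vocal.a.01.vocal').derivationally_related_forms())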
 
 
+@python_2_unicode_compatible
 class Synset(_WordNetObject):
     """Create a Synset from a "<lemma>.<pos>.<number>" string where:
     <lemma> is the word's morphological stem
@@ -364,7 +372,7 @@ class Synset(_WordNetObject):
 
     Synsets have the following methods for retrieving related Synsets.
     They correspond to the names for the pointer symbols defined here:
-    https://wordnet.princeton.edu/documentation/wninput5wn
+    http://wordnet.princeton.edu/man/wninput.5WN.html#sect3
     These methods all return lists of Synsets.
 
     - hypernyms, instance_hypernyms
@@ -394,19 +402,19 @@ class Synset(_WordNetObject):
     """
 
     __slots__ = [
-        "_pos",
-        "_offset",
-        "_name",
-        "_frame_ids",
-        "_lemmas",
-        "_lemma_names",
-        "_definition",
-        "_examples",
-        "_lexname",
-        "_pointers",
-        "_lemma_pointers",
-        "_max_depth",
-        "_min_depth",
+        '_pos',
+        '_offset',
+        '_name',
+        '_frame_ids',
+        '_lemmas',
+        '_lemma_names',
+        '_definition',
+        '_examples',
+        '_lexname',
+        '_pointers',
+        '_lemma_pointers',
+        '_max_depth',
+        '_min_depth',
     ]
 
     def __init__(self, wordnet_corpus_reader):
@@ -451,16 +459,16 @@ class Synset(_WordNetObject):
 
     def _needs_root(self):
         if self._pos == NOUN:
-            if self._wordnet_corpus_reader.get_version() == "1.6":
+            if self._wordnet_corpus_reader.get_version() == '1.6':
                 return True
             else:
                 return False
         elif self._pos == VERB:
             return True
 
-    def lemma_names(self, lang="eng"):
-        """Return all the lemma_names associated with the synset"""
-        if lang == "eng":
+    def lemma_names(self, lang='eng'):
+        '''Return all the lemma_names associated with the synset'''
+        if lang == 'eng':
             return self._lemma_names
         else:
             self._wordnet_corpus_reader._load_lang_data(lang)
@@ -471,9 +479,9 @@ class Synset(_WordNetObject):
             else:
                 return []
 
-    def lemmas(self, lang="eng"):
-        """Return all the lemma objects associated with the synset"""
-        if lang == "eng":
+    def lemmas(self, lang='eng'):
+        '''Return all the lemma objects associated with the synset'''
+        if lang == 'eng':
             return self._lemmas
         else:
             self._wordnet_corpus_reader._load_lang_data(lang)
@@ -659,7 +667,7 @@ class Synset(_WordNetObject):
         synsets = self.common_hypernyms(other)
         if simulate_root:
             fake_synset = Synset(None)
-            fake_synset._name = "*ROOT*"
+            fake_synset._name = '*ROOT*'
             fake_synset.hypernyms = lambda: []
             fake_synset.instance_hypernyms = lambda: []
             synsets.append(fake_synset)
@@ -692,13 +700,13 @@ class Synset(_WordNetObject):
             distances |= hypernym.hypernym_distances(distance + 1, simulate_root=False)
         if simulate_root:
             fake_synset = Synset(None)
-            fake_synset._name = "*ROOT*"
+            fake_synset._name = '*ROOT*'
             fake_synset_distance = max(distances, key=itemgetter(1))[1]
             distances.add((fake_synset, fake_synset_distance + 1))
         return distances
 
     def _shortest_hypernym_paths(self, simulate_root):
-        if self._name == "*ROOT*":
+        if self._name == '*ROOT*':
             return {self: 0}
 
         queue = deque([(self, 0)])
@@ -716,7 +724,7 @@ class Synset(_WordNetObject):
 
         if simulate_root:
             fake_synset = Synset(None)
-            fake_synset._name = "*ROOT*"
+            fake_synset._name = '*ROOT*'
             path[fake_synset] = max(path.values()) + 1
 
         return path
@@ -745,9 +753,9 @@ class Synset(_WordNetObject):
         # For each ancestor synset common to both subject synsets, find the
         # connecting path length. Return the shortest of these.
 
-        inf = float("inf")
+        inf = float('inf')
         path_distance = inf
-        for synset, d1 in dist_dict1.items():
+        for synset, d1 in iteritems(dist_dict1):
             d2 = dist_dict2.get(synset, inf)
             path_distance = min(path_distance, d1 + d2)
 
@@ -854,8 +862,8 @@ class Synset(_WordNetObject):
 
         if self._pos != other._pos:
             raise WordNetError(
-                "Computing the lch similarity requires "
-                "%s and %s to have the same part of speech." % (self, other)
+                'Computing the lch similarity requires '
+                '%s and %s to have the same part of speech.' % (self, other)
             )
 
         need_root = self._needs_root()
@@ -1066,14 +1074,14 @@ class WordNetCorpusReader(CorpusReader):
     A corpus reader used to access wordnet or its variants.
     """
 
-    _ENCODING = "utf8"
+    _ENCODING = 'utf8'
 
     # { Part-of-speech constants
-    ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v"
+    ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
     # }
 
     # { Filename constants
-    _FILEMAP = {ADJ: "adj", ADV: "adv", NOUN: "noun", VERB: "verb"}
+    _FILEMAP = {ADJ: 'adj', ADV: 'adv', NOUN: 'noun', VERB: 'verb'}
     # }
 
     # { Part of speech constants
@@ -1084,21 +1092,21 @@ class WordNetCorpusReader(CorpusReader):
     #: A list of file identifiers for all the fileids used by this
     #: corpus reader.
     _FILES = (
-        "cntlist.rev",
-        "lexnames",
-        "index.sense",
-        "index.adj",
-        "index.adv",
-        "index.noun",
-        "index.verb",
-        "data.adj",
-        "data.adv",
-        "data.noun",
-        "data.verb",
-        "adj.exc",
-        "adv.exc",
-        "noun.exc",
-        "verb.exc",
+        'cntlist.rev',
+        'lexnames',
+        'index.sense',
+        'index.adj',
+        'index.adv',
+        'index.noun',
+        'index.verb',
+        'data.adj',
+        'data.adv',
+        'data.noun',
+        'data.verb',
+        'adj.exc',
+        'adv.exc',
+        'noun.exc',
+        'verb.exc',
     )
 
     def __init__(self, root, omw_reader):
@@ -1135,7 +1143,7 @@ class WordNetCorpusReader(CorpusReader):
         self._key_synset_file = None
 
         # Load the lexnames
-        for i, line in enumerate(self.open("lexnames")):
+        for i, line in enumerate(self.open('lexnames')):
             index, lexname, _ = line.split()
             assert int(index) == i
             self._lexnames.append(lexname)
@@ -1150,20 +1158,20 @@ class WordNetCorpusReader(CorpusReader):
     # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn
 
     def of2ss(self, of):
-        """ take an id and return the synsets """
+        ''' take an id and return the corresponding synset '''
         return self.synset_from_pos_and_offset(of[-1], int(of[:8]))
 
     def ss2of(self, ss, lang=None):
-        """ return the ID of the synset """
+        ''' return the ID of the synset '''
         pos = ss.pos()
         # Only these 3 WordNets retain the satellite pos tag
-        if lang not in ["nld", "lit", "slk"] and pos == "s":
-            pos = "a"
+        if lang not in ["nld", "lit", "slk"] and pos == 's':
+            pos = 'a'
         return "{:08d}-{}".format(ss.offset(), pos)
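
A sketch of the round trip between synsets and the ILI-style ids produced by ss2of()/of2ss(), assuming the WordNet corpus is installed (the exact offset depends on the WordNet version):

    from nltk.corpus import wordnet as wn

    dog = wn.synset('dog.n.01')
    ili = wn.ss2of(dog)           # something like '02084071-n': zero-padded offset plus POS letter
    assert wn.of2ss(ili) == dog   # of2ss() reverses ss2of()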
 
     def _load_lang_data(self, lang):
-        """ load the wordnet data of the requested language from the file to
-        the cache, _lang_data """
+        ''' load the wordnet data of the requested language from the file to
+        the cache, _lang_data '''
 
         if lang in self._lang_data.keys():
             return
@@ -1171,20 +1179,20 @@ class WordNetCorpusReader(CorpusReader):
         if lang not in self.langs():
             raise WordNetError("Language is not supported.")
 
-        f = self._omw_reader.open("{0:}/wn-data-{0:}.tab".format(lang))
+        f = self._omw_reader.open('{0:}/wn-data-{0:}.tab'.format(lang))
         self.custom_lemmas(f, lang)
         f.close()
 
     def langs(self):
-        """ return a list of languages supported by Multilingual Wordnet """
+        ''' return a list of languages supported by Multilingual Wordnet '''
         import os
 
-        langs = ["eng"]
+        langs = ['eng']
         fileids = self._omw_reader.fileids()
         for fileid in fileids:
             file_name, file_extension = os.path.splitext(fileid)
-            if file_extension == ".tab":
-                langs.append(file_name.split("-")[-1])
+            if file_extension == '.tab':
+                langs.append(file_name.split('-')[-1])
 
         return langs
 
@@ -1192,8 +1200,8 @@ class WordNetCorpusReader(CorpusReader):
         for suffix in self._FILEMAP.values():
 
             # parse each line of the file (ignoring comment lines)
-            for i, line in enumerate(self.open("index.%s" % suffix)):
-                if line.startswith(" "):
+            for i, line in enumerate(self.open('index.%s' % suffix)):
+                if line.startswith(' '):
                     continue
 
                 _iter = iter(line.split())
@@ -1229,8 +1237,8 @@ class WordNetCorpusReader(CorpusReader):
 
                 # raise more informative error with file name and line number
                 except (AssertionError, ValueError) as e:
-                    tup = ("index.%s" % suffix), (i + 1), e
-                    raise WordNetError("file %s, line %i: %s" % tup)
+                    tup = ('index.%s' % suffix), (i + 1), e
+                    raise WordNetError('file %s, line %i: %s' % tup)
 
                 # map lemmas and parts of speech to synsets
                 self._lemma_pos_offset_map[lemma][pos] = synset_offsets
@@ -1241,7 +1249,7 @@ class WordNetCorpusReader(CorpusReader):
         # load the exception file data into memory
         for pos, suffix in self._FILEMAP.items():
             self._exception_map[pos] = {}
-            for line in self.open("%s.exc" % suffix):
+            for line in self.open('%s.exc' % suffix):
                 terms = line.split()
                 self._exception_map[pos][terms[0]] = terms[1:]
         self._exception_map[ADJ_SAT] = self._exception_map[ADJ]
@@ -1264,7 +1272,7 @@ class WordNetCorpusReader(CorpusReader):
     def get_version(self):
         fh = self._data_file(ADJ)
         for line in fh:
-            match = re.search(r"WordNet (\d+\.\d+) Copyright", line)
+            match = re.search(r'WordNet (\d+\.\d+) Copyright', line)
             if match is not None:
                 version = match.group(1)
                 fh.seek(0)
@@ -1274,8 +1282,8 @@ class WordNetCorpusReader(CorpusReader):
     # Loading Lemmas
     #############################################################
 
-    def lemma(self, name, lang="eng"):
-        """Return lemma object that matches the name"""
+    def lemma(self, name, lang='eng'):
+        '''Return lemma object that matches the name'''
         # cannot simply split on first '.',
         # e.g.: '.45_caliber.a.01..45_caliber'
         separator = SENSENUM_RE.search(name).end()
@@ -1286,19 +1294,19 @@ class WordNetCorpusReader(CorpusReader):
         for lemma in synset.lemmas(lang):
             if lemma._name == lemma_name:
                 return lemma
-        raise WordNetError("no lemma %r in %r" % (lemma_name, synset_name))
+        raise WordNetError('no lemma %r in %r' % (lemma_name, synset_name))
 
     def lemma_from_key(self, key):
         # Keys are case sensitive and always lower-case
         key = key.lower()
 
-        lemma_name, lex_sense = key.split("%")
-        pos_number, lexname_index, lex_id, _, _ = lex_sense.split(":")
+        lemma_name, lex_sense = key.split('%')
+        pos_number, lexname_index, lex_id, _, _ = lex_sense.split(':')
         pos = self._pos_names[int(pos_number)]
 
         # open the key -> synset file if necessary
         if self._key_synset_file is None:
-            self._key_synset_file = self.open("index.sense")
+            self._key_synset_file = self.open('index.sense')
 
         # Find the synset for the lemma.
         synset_line = _binary_search_file(self._key_synset_file, key)
@@ -1318,14 +1326,14 @@ class WordNetCorpusReader(CorpusReader):
     #############################################################
     def synset(self, name):
         # split name into lemma, part of speech and synset number
-        lemma, pos, synset_index_str = name.lower().rsplit(".", 2)
+        lemma, pos, synset_index_str = name.lower().rsplit('.', 2)
         synset_index = int(synset_index_str) - 1
 
         # get the offset for this synset
         try:
             offset = self._lemma_pos_offset_map[lemma][pos][synset_index]
         except KeyError:
-            message = "no lemma %r with part of speech %r"
+            message = 'no lemma %r with part of speech %r'
             raise WordNetError(message % (lemma, pos))
         except IndexError:
             n_senses = len(self._lemma_pos_offset_map[lemma][pos])
@@ -1340,13 +1348,13 @@ class WordNetCorpusReader(CorpusReader):
         synset = self.synset_from_pos_and_offset(pos, offset)
 
         # some basic sanity checks on loaded attributes
-        if pos == "s" and synset._pos == "a":
+        if pos == 's' and synset._pos == 'a':
             message = (
-                "adjective satellite requested but only plain "
-                "adjective found for lemma %r"
+                'adjective satellite requested but only plain '
+                'adjective found for lemma %r'
             )
             raise WordNetError(message % lemma)
-        assert synset._pos == pos or (pos == "a" and synset._pos == "s")
+        assert synset._pos == pos or (pos == 'a' and synset._pos == 's')
 
         # Return the synset object.
         return synset
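
The lemma.pos.nn naming scheme parsed here is easiest to see with a concrete lookup; a small sketch, assuming the WordNet corpus is available:

    from nltk.corpus import wordnet as wn

    dog = wn.synset('dog.n.01')       # lemma 'dog', noun, first sense
    print(dog.definition())
    print(dog.hypernyms())            # e.g. [Synset('canine.n.02'), Synset('domestic_animal.n.01')]
    # Asking for a sense that does not exist, e.g. wn.synset('dog.n.99'), raises WordNetError.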
@@ -1359,7 +1367,7 @@ class WordNetCorpusReader(CorpusReader):
         if pos == ADJ_SAT:
             pos = ADJ
         if self._data_file_map.get(pos) is None:
-            fileid = "data.%s" % self._FILEMAP[pos]
+            fileid = 'data.%s' % self._FILEMAP[pos]
             self._data_file_map[pos] = self.open(fileid)
         return self._data_file_map[pos]
 
@@ -1376,7 +1384,7 @@ class WordNetCorpusReader(CorpusReader):
         self._synset_offset_cache[pos][offset] = synset
         return synset
 
-    @deprecated("Use public method synset_from_pos_and_offset() instead")
+    @deprecated('Use public method synset_from_pos_and_offset() instead')
     def _synset_from_pos_and_offset(self, *args, **kwargs):
         """
         Hack to help people like the readers of
@@ -1393,13 +1401,16 @@ class WordNetCorpusReader(CorpusReader):
         try:
 
             # parse out the definitions and examples from the gloss
-            columns_str, gloss = data_file_line.strip().split("|")
-            definition = re.sub(r"[\"].*?[\"]", "", gloss).strip()
-            examples = re.findall(r'"([^"]*)"', gloss)
-            for example in examples:
-                synset._examples.append(example)
-
-            synset._definition = definition.strip("; ")
+            columns_str, gloss = data_file_line.split('|')
+            gloss = gloss.strip()
+            definitions = []
+            for gloss_part in gloss.split(';'):
+                gloss_part = gloss_part.strip()
+                if gloss_part.startswith('"'):
+                    synset._examples.append(gloss_part.strip('"'))
+                else:
+                    definitions.append(gloss_part)
+            synset._definition = '; '.join(definitions)
 
             # split the other info into fields
             _iter = iter(columns_str.split())
@@ -1425,7 +1436,7 @@ class WordNetCorpusReader(CorpusReader):
                 # get the lex_id (used for sense_keys)
                 lex_id = int(_next_token(), 16)
                 # If the lemma has a syntactic marker, extract it.
-                m = re.match(r"(.*?)(\(.*\))?$", lemma_name)
+                m = re.match(r'(.*?)(\(.*\))?$', lemma_name)
                 lemma_name, syn_mark = m.groups()
                 # create the lemma object
                 lemma = Lemma(self, synset, lemma_name, lexname_index, lex_id, syn_mark)
@@ -1439,7 +1450,7 @@ class WordNetCorpusReader(CorpusReader):
                 offset = int(_next_token())
                 pos = _next_token()
                 lemma_ids_str = _next_token()
-                if lemma_ids_str == "0000":
+                if lemma_ids_str == '0000':
                     synset._pointers[symbol].add((pos, offset))
                 else:
                     source_index = int(lemma_ids_str[:2], 16) - 1
@@ -1458,7 +1469,7 @@ class WordNetCorpusReader(CorpusReader):
                 for _ in range(frame_count):
                     # read the plus sign
                     plus = _next_token()
-                    assert plus == "+"
+                    assert plus == '+'
                     # read the frame and lemma number
                     frame_number = int(_next_token())
                     frame_string_fmt = VERB_FRAME_STRINGS[frame_number]
@@ -1477,7 +1488,7 @@ class WordNetCorpusReader(CorpusReader):
 
         # raise a more informative error with line text
         except ValueError as e:
-            raise WordNetError("line %r: %s" % (data_file_line, e))
+            raise WordNetError('line %r: %s' % (data_file_line, e))
 
         # set sense keys for Lemma objects - note that this has to be
         # done afterwards so that the relations are available
@@ -1485,9 +1496,9 @@ class WordNetCorpusReader(CorpusReader):
             if synset._pos == ADJ_SAT:
                 head_lemma = synset.similar_tos()[0]._lemmas[0]
                 head_name = head_lemma._name
-                head_id = "%02d" % head_lemma._lex_id
+                head_id = '%02d' % head_lemma._lex_id
             else:
-                head_name = head_id = ""
+                head_name = head_id = ''
             tup = (
                 lemma._name,
                 WordNetCorpusReader._pos_numbers[synset._pos],
@@ -1496,14 +1507,14 @@ class WordNetCorpusReader(CorpusReader):
                 head_name,
                 head_id,
             )
-            lemma._key = ("%s%%%d:%02d:%02d:%s:%s" % tup).lower()
+            lemma._key = ('%s%%%d:%02d:%02d:%s:%s' % tup).lower()
 
         # the canonical name is based on the first lemma
         lemma_name = synset._lemmas[0]._name.lower()
         offsets = self._lemma_pos_offset_map[lemma_name][synset._pos]
         sense_index = offsets.index(synset._offset)
         tup = lemma_name, synset._pos, sense_index + 1
-        synset._name = "%s.%s.%02i" % tup
+        synset._name = '%s.%s.%02i' % tup
 
         return synset
 
@@ -1512,7 +1523,7 @@ class WordNetCorpusReader(CorpusReader):
         Retrieves synset based on a given sense_key. Sense keys can be
         obtained from lemma.key()
 
-        From https://wordnet.princeton.edu/documentation/senseidx5wn:
+        From https://wordnet.princeton.edu/wordnet/man/senseidx.5WN.html:
         A sense_key is represented as:
             lemma % lex_sense (e.g. 'dog%1:18:01::')
         where lex_sense is encoded as:
@@ -1550,14 +1561,14 @@ class WordNetCorpusReader(CorpusReader):
                 "valid {} could not be extracted from the sense key".format(error)
             )
 
-        synset_id = ".".join([lemma, synset_types[int(ss_type)], lex_id])
+        synset_id = '.'.join([lemma, synset_types[int(ss_type)], lex_id])
         return self.synset(synset_id)
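
A short sketch of going from a lemma to its sense key and back, assuming the WordNet corpus is installed (the exact key string depends on the WordNet data):

    from nltk.corpus import wordnet as wn

    key = wn.lemma('dog.n.01.dog').key()     # e.g. 'dog%1:05:00::'
    print(wn.lemma_from_key(key))            # Lemma('dog.n.01.dog')
    print(wn.synset_from_sense_key(key))     # Synset('dog.n.01')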
 
     #############################################################
     # Retrieve synsets and lemmas.
     #############################################################
 
-    def synsets(self, lemma, pos=None, lang="eng", check_exceptions=True):
+    def synsets(self, lemma, pos=None, lang='eng', check_exceptions=True):
         """Load all synsets with a given lemma and part of speech tag.
         If no pos is specified, all synsets for all parts of speech
         will be loaded.
@@ -1566,7 +1577,7 @@ class WordNetCorpusReader(CorpusReader):
         """
         lemma = lemma.lower()
 
-        if lang == "eng":
+        if lang == 'eng':
             get_synset = self.synset_from_pos_and_offset
             index = self._lemma_pos_offset_map
             if pos is None:
@@ -1588,13 +1599,13 @@ class WordNetCorpusReader(CorpusReader):
                     synset_list.append(self.of2ss(l))
             return synset_list
 
-    def lemmas(self, lemma, pos=None, lang="eng"):
+    def lemmas(self, lemma, pos=None, lang='eng'):
         """Return all Lemma objects with a name matching the specified lemma
         name and part of speech tag. Matches any part of speech tag if none is
         specified."""
 
         lemma = lemma.lower()
-        if lang == "eng":
+        if lang == 'eng':
             return [
                 lemma_obj
                 for synset in self.synsets(lemma, pos)
@@ -1614,13 +1625,13 @@ class WordNetCorpusReader(CorpusReader):
                         lemmas.append(lemma_obj)
             return lemmas
 
-    def all_lemma_names(self, pos=None, lang="eng"):
+    def all_lemma_names(self, pos=None, lang='eng'):
         """Return all lemma names for all synsets for the given
         part of speech tag and language or languages. If pos is
         not specified, all synsets for all parts of speech will
         be used."""
 
-        if lang == "eng":
+        if lang == 'eng':
             if pos is None:
                 return iter(self._lemma_pos_offset_map)
             else:
@@ -1637,7 +1648,7 @@ class WordNetCorpusReader(CorpusReader):
                     continue
                 lemma.extend(self._lang_data[lang][0][i])
 
-            lemma = iter(set(lemma))
+            lemma = list(set(lemma))
             return lemma
 
     def all_synsets(self, pos=None):
@@ -1661,7 +1672,7 @@ class WordNetCorpusReader(CorpusReader):
             # be moved while we're not looking.
             if pos_tag == ADJ_SAT:
                 pos_tag = ADJ
-            fileid = "data.%s" % self._FILEMAP[pos_tag]
+            fileid = 'data.%s' % self._FILEMAP[pos_tag]
             data_file = self.open(fileid)
 
             try:
@@ -1698,18 +1709,18 @@ class WordNetCorpusReader(CorpusReader):
             else:
                 data_file.close()
 
-    def words(self, lang="eng"):
+    def words(self, lang='eng'):
         """return lemmas of the given language as list of words"""
         return self.all_lemma_names(lang=lang)
 
-    def license(self, lang="eng"):
+    def license(self, lang='eng'):
         """Return the contents of LICENSE (for omw)
            use lang=lang to get the license for an individual language"""
-        if lang == "eng":
+        if lang == 'eng':
             return self.open("LICENSE").read()
         elif lang in self.langs():
             return self._omw_reader.open("{}/LICENSE".format(lang)).read()
-        elif lang == "omw":
+        elif lang == 'omw':
             # under the assumption you don't mean Omwunra-Toqura
             return self._omw_reader.open("LICENSE").read()
         elif lang in self._lang_data:
@@ -1717,14 +1728,14 @@ class WordNetCorpusReader(CorpusReader):
         else:
             raise WordNetError("Language is not supported.")
 
-    def readme(self, lang="omw"):
+    def readme(self, lang='omw'):
         """Return the contents of README (for omw)
            use lang=lang to get the readme for an individual language"""
-        if lang == "eng":
+        if lang == 'eng':
             return self.open("README").read()
         elif lang in self.langs():
             return self._omw_reader.open("{}/README".format(lang)).read()
-        elif lang == "omw":
+        elif lang == 'omw':
             # under the assumption you don't mean Omwunra-Toqura
             return self._omw_reader.open("README").read()
         elif lang in self._lang_data:
@@ -1732,14 +1743,14 @@ class WordNetCorpusReader(CorpusReader):
         else:
             raise WordNetError("Language is not supported.")
 
-    def citation(self, lang="omw"):
+    def citation(self, lang='omw'):
         """Return the contents of citation.bib file (for omw)
            use lang=lang to get the citation for an individual language"""
-        if lang == "eng":
+        if lang == 'eng':
             return self.open("citation.bib").read()
         elif lang in self.langs():
             return self._omw_reader.open("{}/citation.bib".format(lang)).read()
-        elif lang == "omw":
+        elif lang == 'omw':
             # under the assumption you don't mean Omwunra-Toqura
             return self._omw_reader.open("citation.bib").read()
         elif lang in self._lang_data:
@@ -1753,15 +1764,15 @@ class WordNetCorpusReader(CorpusReader):
     def lemma_count(self, lemma):
         """Return the frequency count for this Lemma"""
         # Currently, count only works for English
-        if lemma._lang != "eng":
+        if lemma._lang != 'eng':
             return 0
         # open the count file if we haven't already
         if self._key_count_file is None:
-            self._key_count_file = self.open("cntlist.rev")
+            self._key_count_file = self.open('cntlist.rev')
         # find the key in the counts file and return the count
         line = _binary_search_file(self._key_count_file, lemma._key)
         if line:
-            return int(line.rsplit(" ", 1)[-1])
+            return int(line.rsplit(' ', 1)[-1])
         else:
             return 0
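
The counts come from the cntlist.rev file bundled with WordNet; a quick sketch (the actual number depends on the installed data):

    from nltk.corpus import wordnet as wn

    dog = wn.lemma('dog.n.01.dog')
    print(dog.count())    # frequency listed in cntlist.rev, or 0 if the sense key is absent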
 
@@ -1836,27 +1847,27 @@ class WordNetCorpusReader(CorpusReader):
 
     MORPHOLOGICAL_SUBSTITUTIONS = {
         NOUN: [
-            ("s", ""),
-            ("ses", "s"),
-            ("ves", "f"),
-            ("xes", "x"),
-            ("zes", "z"),
-            ("ches", "ch"),
-            ("shes", "sh"),
-            ("men", "man"),
-            ("ies", "y"),
+            ('s', ''),
+            ('ses', 's'),
+            ('ves', 'f'),
+            ('xes', 'x'),
+            ('zes', 'z'),
+            ('ches', 'ch'),
+            ('shes', 'sh'),
+            ('men', 'man'),
+            ('ies', 'y'),
         ],
         VERB: [
-            ("s", ""),
-            ("ies", "y"),
-            ("es", "e"),
-            ("es", ""),
-            ("ed", "e"),
-            ("ed", ""),
-            ("ing", "e"),
-            ("ing", ""),
+            ('s', ''),
+            ('ies', 'y'),
+            ('es', 'e'),
+            ('es', ''),
+            ('ed', 'e'),
+            ('ed', ''),
+            ('ing', 'e'),
+            ('ing', ''),
         ],
-        ADJ: [("er", ""), ("est", ""), ("er", "e"), ("est", "e")],
+        ADJ: [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')],
         ADV: [],
     }
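
These suffix-substitution rules drive morphy(); a quick sketch of their effect, assuming the WordNet corpus is installed:

    from nltk.corpus import wordnet as wn

    print(wn.morphy('dogs', wn.NOUN))        # 'dog'     via the ('s', '') rule
    print(wn.morphy('churches', wn.NOUN))    # 'church'  via the ('ches', 'ch') rule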
 
@@ -1986,18 +1997,19 @@ class WordNetCorpusReader(CorpusReader):
         :param lang: ISO 639-3 code of the language of the tab file
         """
         if len(lang) != 3:
-            raise ValueError("lang should be a (3 character) ISO 639-3 code")
+            raise ValueError('lang should be a (3 character) ISO 639-3 code')
         self._lang_data[lang] = [defaultdict(list), defaultdict(list)]
-        for line in tab_file.readlines():
-            if isinstance(line, bytes):
+        for l in tab_file.readlines():
+            if isinstance(l, bytes):
                 # Support byte-stream files (e.g. as returned by Python 2's
                 # open() function) as well as text-stream ones
-                line = line.decode("utf-8")
-            if not line.startswith("#"):
-                offset_pos, lemma_type, lemma = line.strip().split("\t")
-                lemma = lemma.strip().replace(" ", "_")
-                self._lang_data[lang][0][offset_pos].append(lemma)
-                self._lang_data[lang][1][lemma.lower()].append(offset_pos)
+                l = l.decode('utf-8')
+            l = l.replace('\n', '')
+            l = l.replace(' ', '_')
+            if l[0] != '#':
+                word = l.split('\t')
+                self._lang_data[lang][0][word[0]].append(word[2])
+                self._lang_data[lang][1][word[2].lower()].append(word[0])
         # Make sure no more entries are accidentally added subsequently
         self._lang_data[lang][0].default_factory = None
         self._lang_data[lang][1].default_factory = None
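
A sketch of the three-column tab format this method expects (offset-pos, type, lemma, separated by tabs), using an in-memory file and the made-up language code 'qqq'; the offset shown assumes WordNet 3.0 data:

    from io import StringIO
    from nltk.corpus import wordnet as wn

    tab = StringIO('02084071-n\tqqq:lemma\thond\n')
    wn.custom_lemmas(tab, lang='qqq')
    print(wn.synset('dog.n.01').lemma_names('qqq'))   # ['hond'] if the offset matches the installed data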
@@ -2014,7 +2026,7 @@ class WordNetICCorpusReader(CorpusReader):
     """
 
     def __init__(self, root, fileids):
-        CorpusReader.__init__(self, root, fileids, encoding="utf8")
+        CorpusReader.__init__(self, root, fileids, encoding='utf8')
 
     # this load function would be more efficient if the data was pickled
     # Note that we can't use NLTK's frequency distributions because
@@ -2111,8 +2123,8 @@ def _lcs_ic(synset1, synset2, ic, verbose=False):
     """
     if synset1._pos != synset2._pos:
         raise WordNetError(
-            "Computing the least common subsumer requires "
-            "%s and %s to have the same part of speech." % (synset1, synset2)
+            'Computing the least common subsumer requires '
+            '%s and %s to have the same part of speech.' % (synset1, synset2)
         )
 
     ic1 = information_content(synset1, ic)
@@ -2136,7 +2148,7 @@ def information_content(synset, ic):
     try:
         icpos = ic[synset._pos]
     except KeyError:
-        msg = "Information content file has no entries for part-of-speech: %s"
+        msg = 'Information content file has no entries for part-of-speech: %s'
         raise WordNetError(msg % synset._pos)
 
     counts = icpos[synset._offset]
@@ -2151,9 +2163,9 @@ def information_content(synset, ic):
 
 
 def _get_pos(field):
-    if field[-1] == "n":
+    if field[-1] == 'n':
         return NOUN
-    elif field[-1] == "v":
+    elif field[-1] == 'v':
         return VERB
     else:
         msg = (
index 6f928b7..8a66720 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: XML Corpus Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -10,9 +10,17 @@ Corpus reader for corpora whose documents are xml files.
 
 (note -- not named 'xml' to avoid conflicting w/ standard xml package)
 """
+from __future__ import print_function, unicode_literals
 
 import codecs
-from xml.etree import ElementTree
+
+# Use the c version of ElementTree, which is faster, if possible:
+try:
+    from xml.etree import cElementTree as ElementTree
+except ImportError:
+    from xml.etree import ElementTree
+
+from six import string_types
 
 from nltk.data import SeekableUnicodeStreamReader
 from nltk.tokenize import WordPunctTokenizer
@@ -39,8 +47,8 @@ class XMLCorpusReader(CorpusReader):
         # Make sure we have exactly one file -- no concatenating XML.
         if fileid is None and len(self._fileids) == 1:
             fileid = self._fileids[0]
-        if not isinstance(fileid, str):
-            raise TypeError("Expected a single file identifier string")
+        if not isinstance(fileid, string_types):
+            raise TypeError('Expected a single file identifier string')
         # Read the XML in using ElementTree.
         elt = ElementTree.parse(self.abspath(fileid).open()).getroot()
         # If requested, wrap it.
@@ -77,7 +85,7 @@ class XMLCorpusReader(CorpusReader):
     def raw(self, fileids=None):
         if fileids is None:
             fileids = self._fileids
-        elif isinstance(fileids, str):
+        elif isinstance(fileids, string_types):
             fileids = [fileids]
         return concat([self.open(f).read() for f in fileids])
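
A minimal sketch of reading a standalone XML corpus with this reader; the directory and pattern below are hypothetical:

    from nltk.corpus.reader import XMLCorpusReader

    reader = XMLCorpusReader(root='/path/to/my_xml_corpus', fileids=r'.*\.xml')
    print(reader.fileids())
    doc = reader.xml(reader.fileids()[0])            # parsed ElementTree element
    print(reader.words(reader.fileids()[0])[:10])    # word/punct tokens from the text content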
 
@@ -142,7 +150,7 @@ class XMLCorpusView(StreamBackedCorpusView):
         if elt_handler:
             self.handle_elt = elt_handler
 
-        self._tagspec = re.compile(tagspec + r"\Z")
+        self._tagspec = re.compile(tagspec + r'\Z')
         """The tag specification for this corpus view."""
 
         self._tag_context = {0: ()}
@@ -162,18 +170,18 @@ class XMLCorpusView(StreamBackedCorpusView):
             finally:
                 infile.close()
         else:
-            with open(fileid, "rb") as infile:
+            with open(fileid, 'rb') as infile:
                 s = infile.readline()
         if s.startswith(codecs.BOM_UTF16_BE):
-            return "utf-16-be"
+            return 'utf-16-be'
         if s.startswith(codecs.BOM_UTF16_LE):
-            return "utf-16-le"
+            return 'utf-16-le'
         if s.startswith(codecs.BOM_UTF32_BE):
-            return "utf-32-be"
+            return 'utf-32-be'
         if s.startswith(codecs.BOM_UTF32_LE):
-            return "utf-32-le"
+            return 'utf-32-le'
         if s.startswith(codecs.BOM_UTF8):
-            return "utf-8"
+            return 'utf-8'
         m = re.match(br'\s*<\?xml\b.*\bencoding="([^"]+)"', s)
         if m:
             return m.group(1).decode()
@@ -181,7 +189,7 @@ class XMLCorpusView(StreamBackedCorpusView):
         if m:
             return m.group(1).decode()
         # No encoding found -- what should the default be?
-        return "utf-8"
+        return 'utf-8'
 
     def handle_elt(self, elt, context):
         """
@@ -222,7 +230,7 @@ class XMLCorpusView(StreamBackedCorpusView):
 
     #: A regular expression used to extract the tag name from a start tag,
     #: end tag, or empty-elt tag string.
-    _XML_TAG_NAME = re.compile("<\s*/?\s*([^\s>]+)")
+    _XML_TAG_NAME = re.compile('<\s*/?\s*([^\s>]+)')
 
     #: A regular expression used to find all start-tags, end-tags, and
     #: empty-elt tags in an XML file.  This regexp is more lenient than
@@ -251,7 +259,7 @@ class XMLCorpusView(StreamBackedCorpusView):
         then this function either backtracks to the last '<', or reads
         another block.
         """
-        fragment = ""
+        fragment = ''
 
         if isinstance(stream, SeekableUnicodeStreamReader):
             startpos = stream.tell()
@@ -265,20 +273,20 @@ class XMLCorpusView(StreamBackedCorpusView):
                 return fragment
 
             # Do we have a fragment that will never be well-formed?
-            if re.search("[<>]", fragment).group(0) == ">":
+            if re.search('[<>]', fragment).group(0) == '>':
                 pos = stream.tell() - (
-                    len(fragment) - re.search("[<>]", fragment).end()
+                    len(fragment) - re.search('[<>]', fragment).end()
                 )
                 raise ValueError('Unexpected ">" near char %s' % pos)
 
             # End of file?
             if not xml_block:
-                raise ValueError("Unexpected end of file: tag not closed")
+                raise ValueError('Unexpected end of file: tag not closed')
 
             # If not, then we must be in the middle of a <..tag..>.
             # If appropriate, backtrack to the most recent '<'
             # character.
-            last_open_bracket = fragment.rfind("<")
+            last_open_bracket = fragment.rfind('<')
             if last_open_bracket > 0:
                 if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
                     if isinstance(stream, SeekableUnicodeStreamReader):
@@ -310,7 +318,7 @@ class XMLCorpusView(StreamBackedCorpusView):
 
         elt_start = None  # where does the elt start
         elt_depth = None  # what context depth
-        elt_text = ""
+        elt_text = ''
 
         while elts == [] or elt_start is not None:
             if isinstance(stream, SeekableUnicodeStreamReader):
@@ -322,46 +330,46 @@ class XMLCorpusView(StreamBackedCorpusView):
                 if elt_start is None:
                     break
                 else:
-                    raise ValueError("Unexpected end of file")
+                    raise ValueError('Unexpected end of file')
 
             # Process each <tag> in the xml fragment.
             for piece in self._XML_PIECE.finditer(xml_fragment):
                 if self._DEBUG:
-                    print("%25s %s" % ("/".join(context)[-20:], piece.group()))
+                    print('%25s %s' % ('/'.join(context)[-20:], piece.group()))
 
-                if piece.group("START_TAG"):
+                if piece.group('START_TAG'):
                     name = self._XML_TAG_NAME.match(piece.group()).group(1)
                     # Keep context up-to-date.
                     context.append(name)
                     # Is this one of the elts we're looking for?
                     if elt_start is None:
-                        if re.match(tagspec, "/".join(context)):
+                        if re.match(tagspec, '/'.join(context)):
                             elt_start = piece.start()
                             elt_depth = len(context)
 
-                elif piece.group("END_TAG"):
+                elif piece.group('END_TAG'):
                     name = self._XML_TAG_NAME.match(piece.group()).group(1)
                     # sanity checks:
                     if not context:
-                        raise ValueError("Unmatched tag </%s>" % name)
+                        raise ValueError('Unmatched tag </%s>' % name)
                     if name != context[-1]:
                         raise ValueError(
-                            "Unmatched tag <%s>...</%s>" % (context[-1], name)
+                            'Unmatched tag <%s>...</%s>' % (context[-1], name)
                         )
                     # Is this the end of an element?
                     if elt_start is not None and elt_depth == len(context):
                         elt_text += xml_fragment[elt_start : piece.end()]
-                        elts.append((elt_text, "/".join(context)))
+                        elts.append((elt_text, '/'.join(context)))
                         elt_start = elt_depth = None
-                        elt_text = ""
+                        elt_text = ''
                     # Keep context up-to-date
                     context.pop()
 
-                elif piece.group("EMPTY_ELT_TAG"):
+                elif piece.group('EMPTY_ELT_TAG'):
                     name = self._XML_TAG_NAME.match(piece.group()).group(1)
                     if elt_start is None:
-                        if re.match(tagspec, "/".join(context) + "/" + name):
-                            elts.append((piece.group(), "/".join(context) + "/" + name))
+                        if re.match(tagspec, '/'.join(context) + '/' + name):
+                            elts.append((piece.group(), '/'.join(context) + '/' + name))
 
             if elt_start is not None:
                 # If we haven't found any elements yet, then keep
@@ -377,7 +385,7 @@ class XMLCorpusView(StreamBackedCorpusView):
                     # take back the last start-tag, and return what
                     # we've gotten so far (elts is non-empty).
                     if self._DEBUG:
-                        print(" " * 36 + "(backtrack)")
+                        print(' ' * 36 + '(backtrack)')
                     if isinstance(stream, SeekableUnicodeStreamReader):
                         stream.seek(startpos)
                         stream.char_seek_forward(elt_start)
@@ -385,7 +393,7 @@ class XMLCorpusView(StreamBackedCorpusView):
                         stream.seek(-(len(xml_fragment) - elt_start), 1)
                     context = context[: elt_depth - 1]
                     elt_start = elt_depth = None
-                    elt_text = ""
+                    elt_text = ''
 
         # Update the _tag_context dict.
         pos = stream.tell()
@@ -396,7 +404,7 @@ class XMLCorpusView(StreamBackedCorpusView):
 
         return [
             elt_handler(
-                ElementTree.fromstring(elt.encode("ascii", "xmlcharrefreplace")),
+                ElementTree.fromstring(elt.encode('ascii', 'xmlcharrefreplace')),
                 context,
             )
             for (elt, context) in elts
index 75ffda2..49a6685 100644 (file)
@@ -22,6 +22,8 @@ to the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
 import os
 import re
 
+from six import string_types
+
 from nltk.tokenize import RegexpTokenizer
 from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
 from nltk.corpus.reader.tagged import TaggedCorpusReader
@@ -37,22 +39,22 @@ class YCOECorpusReader(CorpusReader):
     corpus of Old English prose texts.
     """
 
-    def __init__(self, root, encoding="utf8"):
+    def __init__(self, root, encoding='utf8'):
         CorpusReader.__init__(self, root, [], encoding)
 
         self._psd_reader = YCOEParseCorpusReader(
-            self.root.join("psd"), ".*", ".psd", encoding=encoding
+            self.root.join('psd'), '.*', '.psd', encoding=encoding
         )
-        self._pos_reader = YCOETaggedCorpusReader(self.root.join("pos"), ".*", ".pos")
+        self._pos_reader = YCOETaggedCorpusReader(self.root.join('pos'), '.*', '.pos')
 
         # Make sure we have a consistent set of items:
         documents = set(f[:-4] for f in self._psd_reader.fileids())
         if set(f[:-4] for f in self._pos_reader.fileids()) != documents:
-            raise ValueError('Items in "psd" and "pos" ' "subdirectories do not match.")
+            raise ValueError('Items in "psd" and "pos" ' 'subdirectories do not match.')
 
         fileids = sorted(
-            ["%s.psd" % doc for doc in documents]
-            + ["%s.pos" % doc for doc in documents]
+            ['%s.psd' % doc for doc in documents]
+            + ['%s.pos' % doc for doc in documents]
         )
         CorpusReader.__init__(self, root, fileids, encoding)
         self._documents = sorted(documents)
@@ -65,11 +67,11 @@ class YCOECorpusReader(CorpusReader):
         """
         if fileids is None:
             return self._documents
-        if isinstance(fileids, str):
+        if isinstance(fileids, string_types):
             fileids = [fileids]
         for f in fileids:
             if f not in self._fileids:
-                raise KeyError("File id %s not found" % fileids)
+                raise KeyError('File id %s not found' % fileids)
         # Strip off the '.pos' and '.psd' extensions.
         return sorted(set(f[:-4] for f in fileids))
 
@@ -80,12 +82,12 @@ class YCOECorpusReader(CorpusReader):
         """
         if documents is None:
             return self._fileids
-        elif isinstance(documents, str):
+        elif isinstance(documents, string_types):
             documents = [documents]
         return sorted(
             set(
-                ["%s.pos" % doc for doc in documents]
-                + ["%s.psd" % doc for doc in documents]
+                ['%s.pos' % doc for doc in documents]
+                + ['%s.psd' % doc for doc in documents]
             )
         )
 
@@ -97,41 +99,41 @@ class YCOECorpusReader(CorpusReader):
         if documents is None:
             documents = self._documents
         else:
-            if isinstance(documents, str):
+            if isinstance(documents, string_types):
                 documents = [documents]
             for document in documents:
                 if document not in self._documents:
-                    if document[-4:] in (".pos", ".psd"):
+                    if document[-4:] in ('.pos', '.psd'):
                         raise ValueError(
-                            "Expected a document identifier, not a file "
-                            "identifier.  (Use corpus.documents() to get "
-                            "a list of document identifiers."
+                            'Expected a document identifier, not a file '
+                            'identifier.  (Use corpus.documents() to get '
+                            'a list of document identifiers.'
                         )
                     else:
-                        raise ValueError("Document identifier %s not found" % document)
-        return ["%s.%s" % (d, subcorpus) for d in documents]
+                        raise ValueError('Document identifier %s not found' % document)
+        return ['%s.%s' % (d, subcorpus) for d in documents]
 
     # Delegate to one of our two sub-readers:
     def words(self, documents=None):
-        return self._pos_reader.words(self._getfileids(documents, "pos"))
+        return self._pos_reader.words(self._getfileids(documents, 'pos'))
 
     def sents(self, documents=None):
-        return self._pos_reader.sents(self._getfileids(documents, "pos"))
+        return self._pos_reader.sents(self._getfileids(documents, 'pos'))
 
     def paras(self, documents=None):
-        return self._pos_reader.paras(self._getfileids(documents, "pos"))
+        return self._pos_reader.paras(self._getfileids(documents, 'pos'))
 
     def tagged_words(self, documents=None):
-        return self._pos_reader.tagged_words(self._getfileids(documents, "pos"))
+        return self._pos_reader.tagged_words(self._getfileids(documents, 'pos'))
 
     def tagged_sents(self, documents=None):
-        return self._pos_reader.tagged_sents(self._getfileids(documents, "pos"))
+        return self._pos_reader.tagged_sents(self._getfileids(documents, 'pos'))
 
     def tagged_paras(self, documents=None):
-        return self._pos_reader.tagged_paras(self._getfileids(documents, "pos"))
+        return self._pos_reader.tagged_paras(self._getfileids(documents, 'pos'))
 
     def parsed_sents(self, documents=None):
-        return self._psd_reader.parsed_sents(self._getfileids(documents, "psd"))
+        return self._psd_reader.parsed_sents(self._getfileids(documents, 'psd'))
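
A usage sketch for this reader; note that the YCOE corpus itself is licensed separately and has to be installed manually under nltk_data/corpora/ycoe:

    from nltk.corpus import ycoe

    print(ycoe.documents()[:3])                  # document ids, e.g. ['coadrian.o34', ...]
    print(ycoe.tagged_sents('coadrian.o34')[0])  # delegated to the POS sub-reader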
 
 
 class YCOEParseCorpusReader(BracketParseCorpusReader):
@@ -139,121 +141,121 @@ class YCOEParseCorpusReader(BracketParseCorpusReader):
     that strips out (CODE ...) and (ID ...) nodes."""
 
     def _parse(self, t):
-        t = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t)
-        if re.match(r"\s*\(\s*\)\s*$", t):
+        t = re.sub(r'(?u)\((CODE|ID)[^\)]*\)', '', t)
+        if re.match(r'\s*\(\s*\)\s*$', t):
             return None
         return BracketParseCorpusReader._parse(self, t)
 
 
 class YCOETaggedCorpusReader(TaggedCorpusReader):
-    def __init__(self, root, items, encoding="utf8"):
-        gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*"
+    def __init__(self, root, items, encoding='utf8'):
+        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
         sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
         TaggedCorpusReader.__init__(
-            self, root, items, sep="_", sent_tokenizer=sent_tokenizer
+            self, root, items, sep='_', sent_tokenizer=sent_tokenizer
         )
 
 
 #: A list of all documents and their titles in ycoe.
 documents = {
-    "coadrian.o34": "Adrian and Ritheus",
-    "coaelhom.o3": "Ælfric, Supplemental Homilies",
-    "coaelive.o3": "Ælfric's Lives of Saints",
-    "coalcuin": "Alcuin De virtutibus et vitiis",
-    "coalex.o23": "Alexander's Letter to Aristotle",
-    "coapollo.o3": "Apollonius of Tyre",
-    "coaugust": "Augustine",
-    "cobede.o2": "Bede's History of the English Church",
-    "cobenrul.o3": "Benedictine Rule",
-    "coblick.o23": "Blickling Homilies",
-    "coboeth.o2": "Boethius' Consolation of Philosophy",
-    "cobyrhtf.o3": "Byrhtferth's Manual",
-    "cocanedgD": "Canons of Edgar (D)",
-    "cocanedgX": "Canons of Edgar (X)",
-    "cocathom1.o3": "Ælfric's Catholic Homilies I",
-    "cocathom2.o3": "Ælfric's Catholic Homilies II",
-    "cochad.o24": "Saint Chad",
-    "cochdrul": "Chrodegang of Metz, Rule",
-    "cochristoph": "Saint Christopher",
-    "cochronA.o23": "Anglo-Saxon Chronicle A",
-    "cochronC": "Anglo-Saxon Chronicle C",
-    "cochronD": "Anglo-Saxon Chronicle D",
-    "cochronE.o34": "Anglo-Saxon Chronicle E",
-    "cocura.o2": "Cura Pastoralis",
-    "cocuraC": "Cura Pastoralis (Cotton)",
-    "codicts.o34": "Dicts of Cato",
-    "codocu1.o1": "Documents 1 (O1)",
-    "codocu2.o12": "Documents 2 (O1/O2)",
-    "codocu2.o2": "Documents 2 (O2)",
-    "codocu3.o23": "Documents 3 (O2/O3)",
-    "codocu3.o3": "Documents 3 (O3)",
-    "codocu4.o24": "Documents 4 (O2/O4)",
-    "coeluc1": "Honorius of Autun, Elucidarium 1",
-    "coeluc2": "Honorius of Autun, Elucidarium 1",
-    "coepigen.o3": "Ælfric's Epilogue to Genesis",
-    "coeuphr": "Saint Euphrosyne",
-    "coeust": "Saint Eustace and his companions",
-    "coexodusP": "Exodus (P)",
-    "cogenesiC": "Genesis (C)",
-    "cogregdC.o24": "Gregory's Dialogues (C)",
-    "cogregdH.o23": "Gregory's Dialogues (H)",
-    "coherbar": "Pseudo-Apuleius, Herbarium",
-    "coinspolD.o34": "Wulfstan's Institute of Polity (D)",
-    "coinspolX": "Wulfstan's Institute of Polity (X)",
-    "cojames": "Saint James",
-    "colacnu.o23": "Lacnunga",
-    "colaece.o2": "Leechdoms",
-    "colaw1cn.o3": "Laws, Cnut I",
-    "colaw2cn.o3": "Laws, Cnut II",
-    "colaw5atr.o3": "Laws, Æthelred V",
-    "colaw6atr.o3": "Laws, Æthelred VI",
-    "colawaf.o2": "Laws, Alfred",
-    "colawafint.o2": "Alfred's Introduction to Laws",
-    "colawger.o34": "Laws, Gerefa",
-    "colawine.ox2": "Laws, Ine",
-    "colawnorthu.o3": "Northumbra Preosta Lagu",
-    "colawwllad.o4": "Laws, William I, Lad",
-    "coleofri.o4": "Leofric",
-    "colsigef.o3": "Ælfric's Letter to Sigefyrth",
-    "colsigewB": "Ælfric's Letter to Sigeweard (B)",
-    "colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)",
-    "colwgeat": "Ælfric's Letter to Wulfgeat",
-    "colwsigeT": "Ælfric's Letter to Wulfsige (T)",
-    "colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)",
-    "colwstan1.o3": "Ælfric's Letter to Wulfstan I",
-    "colwstan2.o3": "Ælfric's Letter to Wulfstan II",
-    "comargaC.o34": "Saint Margaret (C)",
-    "comargaT": "Saint Margaret (T)",
-    "comart1": "Martyrology, I",
-    "comart2": "Martyrology, II",
-    "comart3.o23": "Martyrology, III",
-    "comarvel.o23": "Marvels of the East",
-    "comary": "Mary of Egypt",
-    "coneot": "Saint Neot",
-    "conicodA": "Gospel of Nicodemus (A)",
-    "conicodC": "Gospel of Nicodemus (C)",
-    "conicodD": "Gospel of Nicodemus (D)",
-    "conicodE": "Gospel of Nicodemus (E)",
-    "coorosiu.o2": "Orosius",
-    "cootest.o3": "Heptateuch",
-    "coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I",
-    "coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II",
-    "coprefcura.o2": "Preface to the Cura Pastoralis",
-    "coprefgen.o3": "Ælfric's Preface to Genesis",
-    "copreflives.o3": "Ælfric's Preface to Lives of Saints",
-    "coprefsolilo": "Preface to Augustine's Soliloquies",
-    "coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus",
-    "corood": "History of the Holy Rood-Tree",
-    "cosevensl": "Seven Sleepers",
-    "cosolilo": "St. Augustine's Soliloquies",
-    "cosolsat1.o4": "Solomon and Saturn I",
-    "cosolsat2": "Solomon and Saturn II",
-    "cotempo.o3": "Ælfric's De Temporibus Anni",
-    "coverhom": "Vercelli Homilies",
-    "coverhomE": "Vercelli Homilies (E)",
-    "coverhomL": "Vercelli Homilies (L)",
-    "covinceB": "Saint Vincent (Bodley 343)",
-    "covinsal": "Vindicta Salvatoris",
-    "cowsgosp.o3": "West-Saxon Gospels",
-    "cowulf.o34": "Wulfstan's Homilies",
+    'coadrian.o34': 'Adrian and Ritheus',
+    'coaelhom.o3': 'Ælfric, Supplemental Homilies',
+    'coaelive.o3': 'Ælfric\'s Lives of Saints',
+    'coalcuin': 'Alcuin De virtutibus et vitiis',
+    'coalex.o23': 'Alexander\'s Letter to Aristotle',
+    'coapollo.o3': 'Apollonius of Tyre',
+    'coaugust': 'Augustine',
+    'cobede.o2': 'Bede\'s History of the English Church',
+    'cobenrul.o3': 'Benedictine Rule',
+    'coblick.o23': 'Blickling Homilies',
+    'coboeth.o2': 'Boethius\' Consolation of Philosophy',
+    'cobyrhtf.o3': 'Byrhtferth\'s Manual',
+    'cocanedgD': 'Canons of Edgar (D)',
+    'cocanedgX': 'Canons of Edgar (X)',
+    'cocathom1.o3': 'Ælfric\'s Catholic Homilies I',
+    'cocathom2.o3': 'Ælfric\'s Catholic Homilies II',
+    'cochad.o24': 'Saint Chad',
+    'cochdrul': 'Chrodegang of Metz, Rule',
+    'cochristoph': 'Saint Christopher',
+    'cochronA.o23': 'Anglo-Saxon Chronicle A',
+    'cochronC': 'Anglo-Saxon Chronicle C',
+    'cochronD': 'Anglo-Saxon Chronicle D',
+    'cochronE.o34': 'Anglo-Saxon Chronicle E',
+    'cocura.o2': 'Cura Pastoralis',
+    'cocuraC': 'Cura Pastoralis (Cotton)',
+    'codicts.o34': 'Dicts of Cato',
+    'codocu1.o1': 'Documents 1 (O1)',
+    'codocu2.o12': 'Documents 2 (O1/O2)',
+    'codocu2.o2': 'Documents 2 (O2)',
+    'codocu3.o23': 'Documents 3 (O2/O3)',
+    'codocu3.o3': 'Documents 3 (O3)',
+    'codocu4.o24': 'Documents 4 (O2/O4)',
+    'coeluc1': 'Honorius of Autun, Elucidarium 1',
+    'coeluc2': 'Honorius of Autun, Elucidarium 1',
+    'coepigen.o3': 'Ælfric\'s Epilogue to Genesis',
+    'coeuphr': 'Saint Euphrosyne',
+    'coeust': 'Saint Eustace and his companions',
+    'coexodusP': 'Exodus (P)',
+    'cogenesiC': 'Genesis (C)',
+    'cogregdC.o24': 'Gregory\'s Dialogues (C)',
+    'cogregdH.o23': 'Gregory\'s Dialogues (H)',
+    'coherbar': 'Pseudo-Apuleius, Herbarium',
+    'coinspolD.o34': 'Wulfstan\'s Institute of Polity (D)',
+    'coinspolX': 'Wulfstan\'s Institute of Polity (X)',
+    'cojames': 'Saint James',
+    'colacnu.o23': 'Lacnunga',
+    'colaece.o2': 'Leechdoms',
+    'colaw1cn.o3': 'Laws, Cnut I',
+    'colaw2cn.o3': 'Laws, Cnut II',
+    'colaw5atr.o3': 'Laws, Æthelred V',
+    'colaw6atr.o3': 'Laws, Æthelred VI',
+    'colawaf.o2': 'Laws, Alfred',
+    'colawafint.o2': 'Alfred\'s Introduction to Laws',
+    'colawger.o34': 'Laws, Gerefa',
+    'colawine.ox2': 'Laws, Ine',
+    'colawnorthu.o3': 'Northumbra Preosta Lagu',
+    'colawwllad.o4': 'Laws, William I, Lad',
+    'coleofri.o4': 'Leofric',
+    'colsigef.o3': 'Ælfric\'s Letter to Sigefyrth',
+    'colsigewB': 'Ælfric\'s Letter to Sigeweard (B)',
+    'colsigewZ.o34': 'Ælfric\'s Letter to Sigeweard (Z)',
+    'colwgeat': 'Ælfric\'s Letter to Wulfgeat',
+    'colwsigeT': 'Ælfric\'s Letter to Wulfsige (T)',
+    'colwsigeXa.o34': 'Ælfric\'s Letter to Wulfsige (Xa)',
+    'colwstan1.o3': 'Ælfric\'s Letter to Wulfstan I',
+    'colwstan2.o3': 'Ælfric\'s Letter to Wulfstan II',
+    'comargaC.o34': 'Saint Margaret (C)',
+    'comargaT': 'Saint Margaret (T)',
+    'comart1': 'Martyrology, I',
+    'comart2': 'Martyrology, II',
+    'comart3.o23': 'Martyrology, III',
+    'comarvel.o23': 'Marvels of the East',
+    'comary': 'Mary of Egypt',
+    'coneot': 'Saint Neot',
+    'conicodA': 'Gospel of Nicodemus (A)',
+    'conicodC': 'Gospel of Nicodemus (C)',
+    'conicodD': 'Gospel of Nicodemus (D)',
+    'conicodE': 'Gospel of Nicodemus (E)',
+    'coorosiu.o2': 'Orosius',
+    'cootest.o3': 'Heptateuch',
+    'coprefcath1.o3': 'Ælfric\'s Preface to Catholic Homilies I',
+    'coprefcath2.o3': 'Ælfric\'s Preface to Catholic Homilies II',
+    'coprefcura.o2': 'Preface to the Cura Pastoralis',
+    'coprefgen.o3': 'Ælfric\'s Preface to Genesis',
+    'copreflives.o3': 'Ælfric\'s Preface to Lives of Saints',
+    'coprefsolilo': 'Preface to Augustine\'s Soliloquies',
+    'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus',
+    'corood': 'History of the Holy Rood-Tree',
+    'cosevensl': 'Seven Sleepers',
+    'cosolilo': 'St. Augustine\'s Soliloquies',
+    'cosolsat1.o4': 'Solomon and Saturn I',
+    'cosolsat2': 'Solomon and Saturn II',
+    'cotempo.o3': 'Ælfric\'s De Temporibus Anni',
+    'coverhom': 'Vercelli Homilies',
+    'coverhomE': 'Vercelli Homilies (E)',
+    'coverhomL': 'Vercelli Homilies (L)',
+    'covinceB': 'Saint Vincent (Bodley 343)',
+    'covinsal': 'Vindicta Salvatoris',
+    'cowsgosp.o3': 'West-Saxon Gospels',
+    'cowulf.o34': 'Wulfstan\'s Homilies',
 }
index ecd147e..382edc1 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Corpus Reader Utility Functions
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -9,13 +9,16 @@
 # { Lazy Corpus Loader
 ######################################################################
 
+from __future__ import unicode_literals
 import re
 import gc
 import nltk
+from nltk.compat import python_2_unicode_compatible
 
 TRY_ZIPFILE_FIRST = False
 
 
+@python_2_unicode_compatible
 class LazyCorpusLoader(object):
     """
     To see the API documentation for this lazily loaded corpus, first
@@ -54,33 +57,33 @@ class LazyCorpusLoader(object):
         self.__name = self.__name__ = name
         self.__reader_cls = reader_cls
         # If nltk_data_subdir is set explicitly
-        if "nltk_data_subdir" in kwargs:
+        if 'nltk_data_subdir' in kwargs:
             # Use the specified subdirectory path
-            self.subdir = kwargs["nltk_data_subdir"]
+            self.subdir = kwargs['nltk_data_subdir']
             # Pops the `nltk_data_subdir` argument, we don't need it anymore.
-            kwargs.pop("nltk_data_subdir", None)
+            kwargs.pop('nltk_data_subdir', None)
         else:  # Otherwise use 'nltk_data/corpora'
-            self.subdir = "corpora"
+            self.subdir = 'corpora'
         self.__args = args
         self.__kwargs = kwargs
 
     def __load(self):
         # Find the corpus root directory.
-        zip_name = re.sub(r"(([^/]+)(/.*)?)", r"\2.zip/\1/", self.__name)
+        zip_name = re.sub(r'(([^/]+)(/.*)?)', r'\2.zip/\1/', self.__name)
         if TRY_ZIPFILE_FIRST:
             try:
-                root = nltk.data.find("{}/{}".format(self.subdir, zip_name))
+                root = nltk.data.find('{}/{}'.format(self.subdir, zip_name))
             except LookupError as e:
                 try:
-                    root = nltk.data.find("{}/{}".format(self.subdir, self.__name))
+                    root = nltk.data.find('{}/{}'.format(self.subdir, self.__name))
                 except LookupError:
                     raise e
         else:
             try:
-                root = nltk.data.find("{}/{}".format(self.subdir, self.__name))
+                root = nltk.data.find('{}/{}'.format(self.subdir, self.__name))
             except LookupError as e:
                 try:
-                    root = nltk.data.find("{}/{}".format(self.subdir, zip_name))
+                    root = nltk.data.find('{}/{}'.format(self.subdir, zip_name))
                 except LookupError:
                     raise e
 
@@ -114,7 +117,7 @@ class LazyCorpusLoader(object):
         # (see http://bugs.python.org/issue1225107).
         # Without this fix tests may take extra 1.5GB RAM
         # because all corpora gets loaded during test collection.
-        if attr == "__bases__":
+        if attr == '__bases__':
             raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'")
 
         self.__load()
@@ -123,9 +126,9 @@ class LazyCorpusLoader(object):
         return getattr(self, attr)
 
     def __repr__(self):
-        return "<%s in %r (not loaded yet)>" % (
+        return '<%s in %r (not loaded yet)>' % (
             self.__reader_cls.__name__,
-            ".../corpora/" + self.__name,
+            '.../corpora/' + self.__name,
         )
 
     def _unload(self):
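
LazyCorpusLoader above is essentially a lazy proxy: nothing is located or read until the first attribute access, at which point __load() finds the corpus root, builds the real reader, and the loader swaps its own __class__ and __dict__ for the reader's. A minimal, self-contained sketch of that pattern (plain Python; all names below are illustrative, not NLTK APIs):

    # A minimal sketch of the lazy-proxy pattern used above: the expensive
    # load is deferred until the first attribute access, then the proxy
    # morphs into the real object.
    class LazyProxy(object):
        def __init__(self, factory):
            # Store via __dict__ so we don't trip __getattr__ later.
            self.__dict__['_factory'] = factory

        def _load(self):
            obj = self._factory()
            self.__dict__ = obj.__dict__      # adopt the real object's state
            self.__class__ = obj.__class__    # ... and its class

        def __getattr__(self, attr):
            if attr == '__bases__':           # same guard as in __getattr__ above
                raise AttributeError("LazyProxy object has no attribute '__bases__'")
            self._load()
            return getattr(self, attr)

    class Corpus(object):
        def __init__(self):
            self.fileids = ['cowulf.o34']
        def words(self):
            return ['hwaet', 'we']

    reader = LazyProxy(Corpus)
    print(reader.words())     # first access triggers the load
    print(reader.fileids)
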
index e1cc913..65a0c42 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Utility functions
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -30,22 +30,42 @@ resource file, given its URL: ``load()`` loads a given resource, and
 adds it to a resource cache; and ``retrieve()`` copies a given resource
 to a local file.
 """
+from __future__ import print_function, unicode_literals, division
 
 import functools
 import textwrap
 import io
-from io import BytesIO
 import os
 import re
 import sys
 import zipfile
 import codecs
-import pickle
 
 from abc import ABCMeta, abstractmethod
 from gzip import GzipFile, WRITE as GZ_WRITE
 
-from urllib.request import urlopen, url2pathname
+from six import add_metaclass
+from six import string_types, text_type
+from six.moves.urllib.request import urlopen, url2pathname
+
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
+try:  # Python 3.
+    textwrap_indent = functools.partial(textwrap.indent, prefix='  ')
+except AttributeError:  # Python 2; indent() not available for Python2.
+    textwrap_fill = functools.partial(
+        textwrap.fill,
+        initial_indent='  ',
+        subsequent_indent='  ',
+        replace_whitespace=False,
+    )
+
+    def textwrap_indent(text):
+        return '\n'.join(textwrap_fill(line) for line in text.splitlines())
+
 
 try:
     from zlib import Z_SYNC_FLUSH as FLUSH
@@ -54,10 +74,7 @@ except ImportError:
 
 # this import should be more specific:
 import nltk
-from nltk.compat import py3_data, add_py3_data
-from nltk.internals import deprecated
-
-textwrap_indent = functools.partial(textwrap.indent, prefix="  ")
+from nltk.compat import py3_data, add_py3_data, BytesIO
 
 ######################################################################
 # Search Path
@@ -71,32 +88,32 @@ path = []
    (e.g., in their home directory under ~/nltk_data)."""
 
 # User-specified locations:
-_paths_from_env = os.environ.get("NLTK_DATA", str("")).split(os.pathsep)
+_paths_from_env = os.environ.get('NLTK_DATA', str('')).split(os.pathsep)
 path += [d for d in _paths_from_env if d]
-if "APPENGINE_RUNTIME" not in os.environ and os.path.expanduser("~/") != "~/":
-    path.append(os.path.expanduser(str("~/nltk_data")))
+if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
+    path.append(os.path.expanduser(str('~/nltk_data')))
 
-if sys.platform.startswith("win"):
+if sys.platform.startswith('win'):
     # Common locations on Windows:
     path += [
-        os.path.join(sys.prefix, str("nltk_data")),
-        os.path.join(sys.prefix, str("share"), str("nltk_data")),
-        os.path.join(sys.prefix, str("lib"), str("nltk_data")),
-        os.path.join(os.environ.get(str("APPDATA"), str("C:\\")), str("nltk_data")),
-        str(r"C:\nltk_data"),
-        str(r"D:\nltk_data"),
-        str(r"E:\nltk_data"),
+        os.path.join(sys.prefix, str('nltk_data')),
+        os.path.join(sys.prefix, str('share'), str('nltk_data')),
+        os.path.join(sys.prefix, str('lib'), str('nltk_data')),
+        os.path.join(os.environ.get(str('APPDATA'), str('C:\\')), str('nltk_data')),
+        str(r'C:\nltk_data'),
+        str(r'D:\nltk_data'),
+        str(r'E:\nltk_data'),
     ]
 else:
     # Common locations on UNIX & OS X:
     path += [
-        os.path.join(sys.prefix, str("nltk_data")),
-        os.path.join(sys.prefix, str("share"), str("nltk_data")),
-        os.path.join(sys.prefix, str("lib"), str("nltk_data")),
-        str("/usr/share/nltk_data"),
-        str("/usr/local/share/nltk_data"),
-        str("/usr/lib/nltk_data"),
-        str("/usr/local/lib/nltk_data"),
+        os.path.join(sys.prefix, str('nltk_data')),
+        os.path.join(sys.prefix, str('share'), str('nltk_data')),
+        os.path.join(sys.prefix, str('lib'), str('nltk_data')),
+        str('/usr/share/nltk_data'),
+        str('/usr/local/share/nltk_data'),
+        str('/usr/lib/nltk_data'),
+        str('/usr/local/lib/nltk_data'),
     ]
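
The search path above is seeded from the NLTK_DATA environment variable (split on os.pathsep) plus the per-platform defaults listed here. A short usage sketch, assuming an extra directory you want searched at runtime (the directory below is an example):

    import nltk.data

    # NLTK_DATA is only consulted at import time, so set it before importing
    # nltk; extra locations can still be appended to the list afterwards.
    nltk.data.path.append('/srv/shared/nltk_data')   # example directory
    print(nltk.data.path)
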
 
 
@@ -109,7 +126,7 @@ def gzip_open_unicode(
     filename,
     mode="rb",
     compresslevel=9,
-    encoding="utf-8",
+    encoding='utf-8',
     fileobj=None,
     errors=None,
     newline=None,
@@ -135,14 +152,14 @@ def split_resource_url(resource_url):
     >>> split_resource_url('file:///C:/home/nltk')
     ('file', '/C:/home/nltk')
     """
-    protocol, path_ = resource_url.split(":", 1)
-    if protocol == "nltk":
+    protocol, path_ = resource_url.split(':', 1)
+    if protocol == 'nltk':
         pass
-    elif protocol == "file":
-        if path_.startswith("/"):
-            path_ = "/" + path_.lstrip("/")
+    elif protocol == 'file':
+        if path_.startswith('/'):
+            path_ = '/' + path_.lstrip('/')
     else:
-        path_ = re.sub(r"^/{0,2}", "", path_)
+        path_ = re.sub(r'^/{0,2}', '', path_)
     return protocol, path_
 
 
@@ -183,23 +200,23 @@ def normalize_resource_url(resource_url):
         protocol, name = split_resource_url(resource_url)
     except ValueError:
         # the resource url has no protocol, use the nltk protocol by default
-        protocol = "nltk"
+        protocol = 'nltk'
         name = resource_url
     # use file protocol if the path is an absolute path
-    if protocol == "nltk" and os.path.isabs(name):
-        protocol = "file://"
+    if protocol == 'nltk' and os.path.isabs(name):
+        protocol = 'file://'
         name = normalize_resource_name(name, False, None)
-    elif protocol == "file":
-        protocol = "file://"
+    elif protocol == 'file':
+        protocol = 'file://'
         # name is absolute
         name = normalize_resource_name(name, False, None)
-    elif protocol == "nltk":
-        protocol = "nltk:"
+    elif protocol == 'nltk':
+        protocol = 'nltk:'
         name = normalize_resource_name(name, True)
     else:
         # handled by urllib
-        protocol += "://"
-    return "".join([protocol, name])
+        protocol += '://'
+    return ''.join([protocol, name])
 
 
 def normalize_resource_name(resource_name, allow_relative=True, relative_path=None):
@@ -229,24 +246,24 @@ def normalize_resource_name(resource_name, allow_relative=True, relative_path=No
     >>> windows or normalize_resource_name('/dir/file', True, '/') == '/dir/file'
     True
     """
-    is_dir = bool(re.search(r"[\\/.]$", resource_name)) or resource_name.endswith(
+    is_dir = bool(re.search(r'[\\/.]$', resource_name)) or resource_name.endswith(
         os.path.sep
     )
-    if sys.platform.startswith("win"):
-        resource_name = resource_name.lstrip("/")
+    if sys.platform.startswith('win'):
+        resource_name = resource_name.lstrip('/')
     else:
-        resource_name = re.sub(r"^/+", "/", resource_name)
+        resource_name = re.sub(r'^/+', '/', resource_name)
     if allow_relative:
         resource_name = os.path.normpath(resource_name)
     else:
         if relative_path is None:
             relative_path = os.curdir
         resource_name = os.path.abspath(os.path.join(relative_path, resource_name))
-    resource_name = resource_name.replace("\\", "/").replace(os.path.sep, "/")
-    if sys.platform.startswith("win") and os.path.isabs(resource_name):
-        resource_name = "/" + resource_name
-    if is_dir and not resource_name.endswith("/"):
-        resource_name += "/"
+    resource_name = resource_name.replace('\\', '/').replace(os.path.sep, '/')
+    if sys.platform.startswith('win') and os.path.isabs(resource_name):
+        resource_name = '/' + resource_name
+    if is_dir and not resource_name.endswith('/'):
+        resource_name += '/'
     return resource_name
 
 
@@ -255,7 +272,8 @@ def normalize_resource_name(resource_name, allow_relative=True, relative_path=No
 ######################################################################
 
 
-class PathPointer(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class PathPointer(object):
     """
     An abstract base class for 'path pointers,' used by NLTK's data
     package to identify specific paths.  Two subclasses exist:
@@ -296,7 +314,7 @@ class PathPointer(metaclass=ABCMeta):
         """
 
 
-class FileSystemPathPointer(PathPointer, str):
+class FileSystemPathPointer(PathPointer, text_type):
     """
     A path pointer that identifies a file which can be accessed
     directly via a given absolute path.
@@ -312,7 +330,7 @@ class FileSystemPathPointer(PathPointer, str):
 
         _path = os.path.abspath(_path)
         if not os.path.exists(_path):
-            raise IOError("No such file or directory: %r" % _path)
+            raise IOError('No such file or directory: %r' % _path)
         self._path = _path
 
         # There's no need to call str.__init__(), since it's a no-op;
@@ -324,7 +342,7 @@ class FileSystemPathPointer(PathPointer, str):
         return self._path
 
     def open(self, encoding=None):
-        stream = open(self._path, "rb")
+        stream = open(self._path, 'rb')
         if encoding is not None:
             stream = SeekableUnicodeStreamReader(stream, encoding)
         return stream
@@ -337,30 +355,116 @@ class FileSystemPathPointer(PathPointer, str):
         return FileSystemPathPointer(_path)
 
     def __repr__(self):
-        return "FileSystemPathPointer(%r)" % self._path
+        # This should be a byte string under Python 2.x;
+        # we don't want transliteration here so
+        # @python_2_unicode_compatible is not used.
+        return str('FileSystemPathPointer(%r)' % self._path)
 
     def __str__(self):
         return self._path
 
-@deprecated("Use gzip.GzipFile instead as it also uses a buffer.")
+
 class BufferedGzipFile(GzipFile):
-    """A ``GzipFile`` subclass for compatibility with older nltk releases.
+    """
+    A ``GzipFile`` subclass that buffers calls to ``read()`` and ``write()``.
+    This allows faster reads and writes of data to and from gzip-compressed
+    files at the cost of using more memory.
+
+    The default buffer size is 2MB.
 
-    Use ``GzipFile`` directly as it also buffers in all supported
-    Python versions.
+    ``BufferedGzipFile`` is useful for loading large gzipped pickle objects
+    as well as writing large encoded feature files for classifier training.
     """
 
+    MB = 2 ** 20
+    SIZE = 2 * MB
+
     @py3_data
     def __init__(
         self, filename=None, mode=None, compresslevel=9, fileobj=None, **kwargs
     ):
-        """Return a buffered gzip file object."""
+        """
+        Return a buffered gzip file object.
+
+        :param filename: a filesystem path
+        :type filename: str
+        :param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab',
+            'w', or 'wb'
+        :type mode: str
+        :param compresslevel: The compresslevel argument is an integer from 1
+            to 9 controlling the level of compression; 1 is fastest and
+            produces the least compression, and 9 is slowest and produces the
+            most compression. The default is 9.
+        :type compresslevel: int
+        :param fileobj: a BytesIO stream to read from instead of a file.
+        :type fileobj: BytesIO
+        :param size: number of bytes to buffer during calls to read() and write()
+        :type size: int
+        :rtype: BufferedGzipFile
+        """
         GzipFile.__init__(self, filename, mode, compresslevel, fileobj)
+        self._size = kwargs.get('size', self.SIZE)
+        self._nltk_buffer = BytesIO()
+        # cStringIO does not support len.
+        self._len = 0
+
+    def _reset_buffer(self):
+        # For some reason calling BytesIO.truncate() here will lead to
+        # inconsistent writes so just set _buffer to a new BytesIO object.
+        self._nltk_buffer = BytesIO()
+        self._len = 0
+
+    def _write_buffer(self, data):
+        # Simply write to the buffer and increment the buffer size.
+        if data is not None:
+            self._nltk_buffer.write(data)
+            self._len += len(data)
+
+    def _write_gzip(self, data):
+        # Write the current buffer to the GzipFile.
+        GzipFile.write(self, self._nltk_buffer.getvalue())
+        # Then reset the buffer and write the new data to the buffer.
+        self._reset_buffer()
+        self._write_buffer(data)
 
-    def write(self, data):
-        # This is identical to GzipFile.write but does not return
-        # the bytes written to retain compatibility.
-        super().write(data)
+    def close(self):
+        # GzipFile.close() doesn't actually close anything.
+        if self.mode == GZ_WRITE:
+            self._write_gzip(None)
+            self._reset_buffer()
+        return GzipFile.close(self)
+
+    def flush(self, lib_mode=FLUSH):
+        self._nltk_buffer.flush()
+        GzipFile.flush(self, lib_mode)
+
+    def read(self, size=None):
+        if not size:
+            size = self._size
+            contents = BytesIO()
+            while True:
+                blocks = GzipFile.read(self, size)
+                if not blocks:
+                    contents.flush()
+                    break
+                contents.write(blocks)
+            return contents.getvalue()
+        else:
+            return GzipFile.read(self, size)
+
+    def write(self, data, size=-1):
+        """
+        :param data: bytes to write to file or buffer
+        :type data: bytes
+        :param size: buffer at least size bytes before writing to file
+        :type size: int
+        """
+        if not size:
+            size = self._size
+        if self._len + len(data) <= size:
+            self._write_buffer(data)
+        else:
+            self._write_gzip(data)
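
BufferedGzipFile batches write() calls in an in-memory BytesIO and only pushes data to the underlying GzipFile once roughly SIZE (2 MB by default) has accumulated, trading memory for fewer compression calls. A standalone sketch of the same buffering idea using only the standard library (class and variable names here are illustrative, not NLTK APIs):

    import gzip
    import io
    import os
    import tempfile

    # Small writes accumulate in a BytesIO and are flushed to the gzip
    # stream in larger chunks.
    class BufferedGzipWriter(object):
        def __init__(self, path, size=2 ** 20):
            self._gz = gzip.open(path, 'wb')
            self._buf = io.BytesIO()
            self._len = 0
            self._size = size

        def write(self, data):
            self._buf.write(data)
            self._len += len(data)
            if self._len >= self._size:   # flush once the threshold is hit
                self._flush()

        def _flush(self):
            self._gz.write(self._buf.getvalue())
            self._buf = io.BytesIO()
            self._len = 0

        def close(self):
            self._flush()
            self._gz.close()

    path = os.path.join(tempfile.mkdtemp(), 'demo.gz')
    w = BufferedGzipWriter(path, size=16)
    for chunk in (b'abc', b'defgh', b'ijklmnopqr', b'stuvwxyz'):
        w.write(chunk)
    w.close()
    print(gzip.open(path, 'rb').read())   # b'abcdefghijklmnopqrstuvwxyz'
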
 
 
 class GzipFileSystemPathPointer(FileSystemPathPointer):
@@ -371,7 +475,13 @@ class GzipFileSystemPathPointer(FileSystemPathPointer):
     """
 
     def open(self, encoding=None):
-        stream = GzipFile(self._path, "rb")    
+        # Note: In >= Python3.5, GzipFile is already using a
+        # buffered reader in the backend which has a variable self._buffer
+        # See https://github.com/nltk/nltk/issues/1308
+        if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
+            stream = BufferedGzipFile(self._path, 'rb')
+        else:
+            stream = GzipFile(self._path, 'rb')
         if encoding:
             stream = SeekableUnicodeStreamReader(stream, encoding)
         return stream
@@ -384,7 +494,7 @@ class ZipFilePathPointer(PathPointer):
     """
 
     @py3_data
-    def __init__(self, zipfile, entry=""):
+    def __init__(self, zipfile, entry=''):
         """
         Create a new path pointer pointing at the specified entry
         in the given zipfile.
@@ -392,14 +502,14 @@ class ZipFilePathPointer(PathPointer):
         :raise IOError: If the given zipfile does not exist, or if it
         does not contain the specified entry.
         """
-        if isinstance(zipfile, str):
+        if isinstance(zipfile, string_types):
             zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile))
 
         # Check that the entry exists:
         if entry:
 
             # Normalize the entry string, it should be relative:
-            entry = normalize_resource_name(entry, True, "/").lstrip("/")
+            entry = normalize_resource_name(entry, True, '/').lstrip('/')
 
             try:
                 zipfile.getinfo(entry)
@@ -408,14 +518,14 @@ class ZipFilePathPointer(PathPointer):
                 # the zip file.  So if `entry` is a directory name,
                 # then check if the zipfile contains any files that
                 # are under the given directory.
-                if entry.endswith("/") and [
+                if entry.endswith('/') and [
                     n for n in zipfile.namelist() if n.startswith(entry)
                 ]:
                     pass  # zipfile contains a file in that directory.
                 else:
                     # Otherwise, complain.
                     raise IOError(
-                        "Zipfile %r does not contain %r" % (zipfile.filename, entry)
+                        'Zipfile %r does not contain %r' % (zipfile.filename, entry)
                     )
         self._zipfile = zipfile
         self._entry = entry
@@ -439,8 +549,14 @@ class ZipFilePathPointer(PathPointer):
     def open(self, encoding=None):
         data = self._zipfile.read(self._entry)
         stream = BytesIO(data)
-        if self._entry.endswith(".gz"):
-            stream = GzipFile(self._entry, fileobj=stream)
+        if self._entry.endswith('.gz'):
+            # Note: In >= Python3.5, GzipFile is already using a
+            # buffered reader in the backend which has a variable self._buffer
+            # See https://github.com/nltk/nltk/issues/1308
+            if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
+                stream = BufferedGzipFile(self._entry, fileobj=stream)
+            else:
+                stream = GzipFile(self._entry, fileobj=stream)
         elif encoding is not None:
             stream = SeekableUnicodeStreamReader(stream, encoding)
         return stream
@@ -449,11 +565,11 @@ class ZipFilePathPointer(PathPointer):
         return self._zipfile.getinfo(self._entry).file_size
 
     def join(self, fileid):
-        entry = "%s/%s" % (self._entry, fileid)
+        entry = '%s/%s' % (self._entry, fileid)
         return ZipFilePathPointer(self._zipfile, entry)
 
     def __repr__(self):
-        return str("ZipFilePathPointer(%r, %r)") % (self._zipfile.filename, self._entry)
+        return str('ZipFilePathPointer(%r, %r)') % (self._zipfile.filename, self._entry)
 
     def __str__(self):
         return os.path.normpath(os.path.join(self._zipfile.filename, self._entry))
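
ZipFilePathPointer.open() reads the entry's bytes out of the archive, wraps them in a BytesIO, and layers gzip decompression on top for .gz entries. A self-contained sketch of that flow (the helper name and entry path are illustrative, not NLTK APIs):

    import gzip
    import io
    import zipfile

    # Read one entry out of a zip archive the way open() does above.
    def open_zip_entry(zip_file, entry):
        with zipfile.ZipFile(zip_file) as zf:
            data = zf.read(entry)
        stream = io.BytesIO(data)
        if entry.endswith('.gz'):
            stream = gzip.GzipFile(entry, fileobj=stream)
        return stream

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, 'w') as zf:
        zf.writestr('corpora/demo.txt', 'hello')
    print(open_zip_entry(buf, 'corpora/demo.txt').read())   # b'hello'
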
@@ -515,13 +631,13 @@ def find(resource_name, paths=None):
         paths = path
 
     # Check if the resource name includes a zipfile name
-    m = re.match(r"(.*\.zip)/?(.*)$|", resource_name)
+    m = re.match(r'(.*\.zip)/?(.*)$|', resource_name)
     zipfile, zipentry = m.groups()
 
     # Check each item in our path
     for path_ in paths:
         # Is the path item a zipfile?
-        if path_ and (os.path.isfile(path_) and path_.endswith(".zip")):
+        if path_ and (os.path.isfile(path_) and path_.endswith('.zip')):
             try:
                 return ZipFilePathPointer(path_, resource_name)
             except IOError:
@@ -533,7 +649,7 @@ def find(resource_name, paths=None):
             if zipfile is None:
                 p = os.path.join(path_, url2pathname(resource_name))
                 if os.path.exists(p):
-                    if p.endswith(".gz"):
+                    if p.endswith('.gz'):
                         return GzipFileSystemPathPointer(p)
                     else:
                         return FileSystemPathPointer(p)
@@ -550,38 +666,36 @@ def find(resource_name, paths=None):
     # again, assuming that one of the path components is inside a
     # zipfile of the same name.
     if zipfile is None:
-        pieces = resource_name.split("/")
+        pieces = resource_name.split('/')
         for i in range(len(pieces)):
-            modified_name = "/".join(pieces[:i] + [pieces[i] + ".zip"] + pieces[i:])
+            modified_name = '/'.join(pieces[:i] + [pieces[i] + '.zip'] + pieces[i:])
             try:
                 return find(modified_name, paths)
             except LookupError:
                 pass
 
     # Identify the package (i.e. the .zip file) to download.
-    resource_zipname = resource_name.split("/")[1]
-    if resource_zipname.endswith(".zip"):
-        resource_zipname = resource_zipname.rpartition(".")[0]
+    resource_zipname = resource_name.split('/')[1]
+    if resource_zipname.endswith('.zip'):
+        resource_zipname = resource_zipname.rpartition('.')[0]
     # Display a friendly error message if the resource wasn't found:
     msg = str(
         "Resource \33[93m{resource}\033[0m not found.\n"
         "Please use the NLTK Downloader to obtain the resource:\n\n"
         "\33[31m"  # To display red text in terminal.
         ">>> import nltk\n"
-        ">>> nltk.download('{resource}')\n"
+        ">>> nltk.download(\'{resource}\')\n"
         "\033[0m"
     ).format(resource=resource_zipname)
     msg = textwrap_indent(msg)
 
-    msg += "\n  For more information see: https://www.nltk.org/data.html\n"
-
-    msg += "\n  Attempted to load \33[93m{resource_name}\033[0m\n".format(
+    msg += '\n  Attempted to load \33[93m{resource_name}\033[0m\n'.format(
         resource_name=resource_name
     )
 
-    msg += "\n  Searched in:" + "".join("\n    - %r" % d for d in paths)
-    sep = "*" * 70
-    resource_not_found = "\n%s\n%s\n%s\n" % (sep, msg, sep)
+    msg += '\n  Searched in:' + ''.join('\n    - %r' % d for d in paths)
+    sep = '*' * 70
+    resource_not_found = '\n%s\n%s\n%s\n' % (sep, msg, sep)
     raise LookupError(resource_not_found)
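
find() tries each directory on the search path both as a plain directory and as a same-named .zip, and raises a LookupError carrying download instructions when nothing matches. A hedged usage sketch ('tokenizers/punkt' is an example resource id and must already be installed for the lookup to succeed):

    import nltk

    # Illustrative only: resolve a resource id against nltk.data.path.
    try:
        root = nltk.data.find('tokenizers/punkt')
        print(root)        # a FileSystemPathPointer or ZipFilePathPointer
    except LookupError as err:
        print(err)         # the "use the NLTK Downloader" message built above
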
 
 
@@ -598,16 +712,16 @@ def retrieve(resource_url, filename=None, verbose=True):
     """
     resource_url = normalize_resource_url(resource_url)
     if filename is None:
-        if resource_url.startswith("file:"):
+        if resource_url.startswith('file:'):
             filename = os.path.split(resource_url)[-1]
         else:
-            filename = re.sub(r"(^\w+:)?.*/", "", resource_url)
+            filename = re.sub(r'(^\w+:)?.*/', '', resource_url)
     if os.path.exists(filename):
         filename = os.path.abspath(filename)
         raise ValueError("File %r already exists!" % filename)
 
     if verbose:
-        print("Retrieving %r, saving to %r" % (resource_url, filename))
+        print('Retrieving %r, saving to %r' % (resource_url, filename))
 
     # Open the input & output streams.
     infile = _open(resource_url)
@@ -627,43 +741,43 @@ def retrieve(resource_url, filename=None, verbose=True):
 #: load() method.  Keys are format names, and values are format
 #: descriptions.
 FORMATS = {
-    "pickle": "A serialized python object, stored using the pickle module.",
-    "json": "A serialized python object, stored using the json module.",
-    "yaml": "A serialized python object, stored using the yaml module.",
-    "cfg": "A context free grammar.",
-    "pcfg": "A probabilistic CFG.",
-    "fcfg": "A feature CFG.",
-    "fol": "A list of first order logic expressions, parsed with "
+    'pickle': "A serialized python object, stored using the pickle module.",
+    'json': "A serialized python object, stored using the json module.",
+    'yaml': "A serialized python object, stored using the yaml module.",
+    'cfg': "A context free grammar.",
+    'pcfg': "A probabilistic CFG.",
+    'fcfg': "A feature CFG.",
+    'fol': "A list of first order logic expressions, parsed with "
     "nltk.sem.logic.Expression.fromstring.",
-    "logic": "A list of first order logic expressions, parsed with "
+    'logic': "A list of first order logic expressions, parsed with "
     "nltk.sem.logic.LogicParser.  Requires an additional logic_parser "
     "parameter",
-    "val": "A semantic valuation, parsed by nltk.sem.Valuation.fromstring.",
-    "raw": "The raw (byte string) contents of a file.",
-    "text": "The raw (unicode string) contents of a file. ",
+    'val': "A semantic valuation, parsed by nltk.sem.Valuation.fromstring.",
+    'raw': "The raw (byte string) contents of a file.",
+    'text': "The raw (unicode string) contents of a file. ",
 }
 
 #: A dictionary mapping from file extensions to format names, used
 #: by load() when format="auto" to decide the format for a
 #: given resource url.
 AUTO_FORMATS = {
-    "pickle": "pickle",
-    "json": "json",
-    "yaml": "yaml",
-    "cfg": "cfg",
-    "pcfg": "pcfg",
-    "fcfg": "fcfg",
-    "fol": "fol",
-    "logic": "logic",
-    "val": "val",
-    "txt": "text",
-    "text": "text",
+    'pickle': 'pickle',
+    'json': 'json',
+    'yaml': 'yaml',
+    'cfg': 'cfg',
+    'pcfg': 'pcfg',
+    'fcfg': 'fcfg',
+    'fol': 'fol',
+    'logic': 'logic',
+    'val': 'val',
+    'txt': 'text',
+    'text': 'text',
 }
 
 
 def load(
     resource_url,
-    format="auto",
+    format='auto',
     cache=True,
     verbose=False,
     logic_parser=None,
@@ -702,7 +816,9 @@ def load(
     :type cache: bool
     :param cache: If true, add this resource to a cache.  If load()
         finds a resource in its cache, then it will return it from the
-        cache rather than loading it.
+        cache rather than loading it.  The cache uses weak references,
+        so a resource will automatically be expunged from the cache
+        when no more objects are using it.
     :type verbose: bool
     :param verbose: If true, print a message when loading a resource.
         Messages are not displayed when a resource is retrieved from
@@ -720,42 +836,42 @@ def load(
     resource_url = add_py3_data(resource_url)
 
     # Determine the format of the resource.
-    if format == "auto":
-        resource_url_parts = resource_url.split(".")
+    if format == 'auto':
+        resource_url_parts = resource_url.split('.')
         ext = resource_url_parts[-1]
-        if ext == "gz":
+        if ext == 'gz':
             ext = resource_url_parts[-2]
         format = AUTO_FORMATS.get(ext)
         if format is None:
             raise ValueError(
-                "Could not determine format for %s based "
+                'Could not determine format for %s based '
                 'on its file\nextension; use the "format" '
-                "argument to specify the format explicitly." % resource_url
+                'argument to specify the format explicitly.' % resource_url
             )
 
     if format not in FORMATS:
-        raise ValueError("Unknown format type: %s!" % (format,))
+        raise ValueError('Unknown format type: %s!' % (format,))
 
     # If we've cached the resource, then just return it.
     if cache:
         resource_val = _resource_cache.get((resource_url, format))
         if resource_val is not None:
             if verbose:
-                print("<<Using cached copy of %s>>" % (resource_url,))
+                print('<<Using cached copy of %s>>' % (resource_url,))
             return resource_val
 
     # Let the user know what's going on.
     if verbose:
-        print("<<Loading %s>>" % (resource_url,))
+        print('<<Loading %s>>' % (resource_url,))
 
     # Load the resource.
     opened_resource = _open(resource_url)
 
-    if format == "raw":
+    if format == 'raw':
         resource_val = opened_resource.read()
-    elif format == "pickle":
+    elif format == 'pickle':
         resource_val = pickle.load(opened_resource)
-    elif format == "json":
+    elif format == 'json':
         import json
         from nltk.jsontags import json_tags
 
@@ -764,11 +880,11 @@ def load(
         if len(resource_val) != 1:
             tag = next(resource_val.keys())
         if tag not in json_tags:
-            raise ValueError("Unknown json tag.")
-    elif format == "yaml":
+            raise ValueError('Unknown json tag.')
+    elif format == 'yaml':
         import yaml
 
-        resource_val = yaml.safe_load(opened_resource)
+        resource_val = yaml.load(opened_resource)
     else:
         # The resource is a text format.
         binary_data = opened_resource.read()
@@ -776,33 +892,33 @@ def load(
             string_data = binary_data.decode(encoding)
         else:
             try:
-                string_data = binary_data.decode("utf-8")
+                string_data = binary_data.decode('utf-8')
             except UnicodeDecodeError:
-                string_data = binary_data.decode("latin-1")
-        if format == "text":
+                string_data = binary_data.decode('latin-1')
+        if format == 'text':
             resource_val = string_data
-        elif format == "cfg":
+        elif format == 'cfg':
             resource_val = nltk.grammar.CFG.fromstring(string_data, encoding=encoding)
-        elif format == "pcfg":
+        elif format == 'pcfg':
             resource_val = nltk.grammar.PCFG.fromstring(string_data, encoding=encoding)
-        elif format == "fcfg":
+        elif format == 'fcfg':
             resource_val = nltk.grammar.FeatureGrammar.fromstring(
                 string_data,
                 logic_parser=logic_parser,
                 fstruct_reader=fstruct_reader,
                 encoding=encoding,
             )
-        elif format == "fol":
+        elif format == 'fol':
             resource_val = nltk.sem.read_logic(
                 string_data,
                 logic_parser=nltk.sem.logic.LogicParser(),
                 encoding=encoding,
             )
-        elif format == "logic":
+        elif format == 'logic':
             resource_val = nltk.sem.read_logic(
                 string_data, logic_parser=logic_parser, encoding=encoding
             )
-        elif format == "val":
+        elif format == 'val':
             resource_val = nltk.sem.read_valuation(string_data, encoding=encoding)
         else:
             raise AssertionError(
@@ -826,7 +942,7 @@ def load(
     return resource_val
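
load() infers the format from the file extension when format='auto', decodes or unpickles accordingly, and keeps the result in a cache keyed by (resource_url, format). A hedged usage sketch ('tokenizers/punkt/english.pickle' is an example resource id that must already be downloaded):

    import nltk

    url = 'tokenizers/punkt/english.pickle'    # example resource id
    tok = nltk.data.load(url)                  # format inferred as 'pickle'
    tok = nltk.data.load(url, verbose=True)    # prints "<<Using cached copy of ...>>"
    print(tok.tokenize('Hello there. How are you?'))
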
 
 
-def show_cfg(resource_url, escape="##"):
+def show_cfg(resource_url, escape='##'):
     """
     Write out a grammar file, ignoring escaped and empty lines.
 
@@ -838,12 +954,12 @@ def show_cfg(resource_url, escape="##"):
     :param escape: Prepended string that signals lines to be ignored
     """
     resource_url = normalize_resource_url(resource_url)
-    resource_val = load(resource_url, format="text", cache=False)
+    resource_val = load(resource_url, format='text', cache=False)
     lines = resource_val.splitlines()
     for l in lines:
         if l.startswith(escape):
             continue
-        if re.match("^$", l):
+        if re.match('^$', l):
             continue
         print(l)
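
show_cfg() simply loads the resource as text and prints every line that is neither blank nor prefixed with the escape marker. A hedged usage sketch (the grammar id is an example and must be available locally):

    import nltk.data

    # Illustrative only: print a grammar, skipping '##' comment lines.
    nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg')
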
 
@@ -873,11 +989,11 @@ def _open(resource_url):
     resource_url = normalize_resource_url(resource_url)
     protocol, path_ = split_resource_url(resource_url)
 
-    if protocol is None or protocol.lower() == "nltk":
-        return find(path_, path + [""]).open()
-    elif protocol.lower() == "file":
+    if protocol is None or protocol.lower() == 'nltk':
+        return find(path_, path + ['']).open()
+    elif protocol.lower() == 'file':
         # urllib might not use mode='rb', so handle this one ourselves:
-        return find(path_, [""]).open()
+        return find(path_, ['']).open()
     else:
         return urlopen(resource_url)
 
@@ -886,6 +1002,9 @@ def _open(resource_url):
 # Lazy Resource Loader
 ######################################################################
 
+# We shouldn't apply the @python_2_unicode_compatible decorator to
+# LazyLoader; that is the responsibility of resource.__class__.
+
 
 class LazyLoader(object):
     @py3_data
@@ -931,8 +1050,8 @@ class OpenOnDemandZipFile(zipfile.ZipFile):
 
     @py3_data
     def __init__(self, filename):
-        if not isinstance(filename, str):
-            raise TypeError("ReopenableZipFile filename must be a string")
+        if not isinstance(filename, string_types):
+            raise TypeError('ReopenableZipFile filename must be a string')
         zipfile.ZipFile.__init__(self, filename)
         assert self.filename == filename
         self.close()
@@ -942,7 +1061,7 @@ class OpenOnDemandZipFile(zipfile.ZipFile):
 
     def read(self, name):
         assert self.fp is None
-        self.fp = open(self.filename, "rb")
+        self.fp = open(self.filename, 'rb')
         value = zipfile.ZipFile.read(self, name)
         # Ensure that _fileRefCnt needs to be set for Python2and3 compatible code.
         # Since we only opened one file here, we add 1.
@@ -952,14 +1071,14 @@ class OpenOnDemandZipFile(zipfile.ZipFile):
 
     def write(self, *args, **kwargs):
         """:raise NotImplementedError: OpenOnDemandZipfile is read-only"""
-        raise NotImplementedError("OpenOnDemandZipfile is read-only")
+        raise NotImplementedError('OpenOnDemandZipfile is read-only')
 
     def writestr(self, *args, **kwargs):
         """:raise NotImplementedError: OpenOnDemandZipfile is read-only"""
-        raise NotImplementedError("OpenOnDemandZipfile is read-only")
+        raise NotImplementedError('OpenOnDemandZipfile is read-only')
 
     def __repr__(self):
-        return repr(str("OpenOnDemandZipFile(%r)") % self.filename)
+        return repr(str('OpenOnDemandZipFile(%r)') % self.filename)
 
 
 ######################################################################
@@ -987,7 +1106,7 @@ class SeekableUnicodeStreamReader(object):
     DEBUG = True  # : If true, then perform extra sanity checks.
 
     @py3_data
-    def __init__(self, stream, encoding, errors="strict"):
+    def __init__(self, stream, encoding, errors='strict'):
         # Rewind the stream to its beginning.
         stream.seek(0)
 
@@ -1007,7 +1126,7 @@ class SeekableUnicodeStreamReader(object):
         """The function that is used to decode byte strings into
            unicode strings."""
 
-        self.bytebuffer = b""
+        self.bytebuffer = b''
         """A buffer to use bytes that have been read but have not yet
            been decoded.  This is only used when the final bytes from
            a read do not form a complete encoding for a character."""
@@ -1057,7 +1176,7 @@ class SeekableUnicodeStreamReader(object):
 
         # If linebuffer is not empty, then include it in the result
         if self.linebuffer:
-            chars = "".join(self.linebuffer) + chars
+            chars = ''.join(self.linebuffer) + chars
             self.linebuffer = None
             self._rewind_numchars = None
 
@@ -1089,7 +1208,7 @@ class SeekableUnicodeStreamReader(object):
             return line
 
         readsize = size or 72
-        chars = ""
+        chars = ''
 
         # If there's a remaining incomplete line in the buffer, add it.
         if self.linebuffer:
@@ -1102,7 +1221,7 @@ class SeekableUnicodeStreamReader(object):
 
             # If we're at a '\r', then read one extra character, since
             # it might be a '\n', to get the proper line ending.
-            if new_chars and new_chars.endswith("\r"):
+            if new_chars and new_chars.endswith('\r'):
                 new_chars += self._read(1)
 
             chars += new_chars
@@ -1208,13 +1327,13 @@ class SeekableUnicodeStreamReader(object):
         """
         if whence == 1:
             raise ValueError(
-                "Relative seek is not supported for "
-                "SeekableUnicodeStreamReader -- consider "
-                "using char_seek_forward() instead."
+                'Relative seek is not supported for '
+                'SeekableUnicodeStreamReader -- consider '
+                'using char_seek_forward() instead.'
             )
         self.stream.seek(offset, whence)
         self.linebuffer = None
-        self.bytebuffer = b""
+        self.bytebuffer = b''
         self._rewind_numchars = None
         self._rewind_checkpoint = self.stream.tell()
 
@@ -1223,7 +1342,7 @@ class SeekableUnicodeStreamReader(object):
         Move the read pointer forward by ``offset`` characters.
         """
         if offset < 0:
-            raise ValueError("Negative offsets are not supported")
+            raise ValueError('Negative offsets are not supported')
         # Clear all buffers.
         self.seek(self.tell())
         # Perform the seek operation.
@@ -1240,7 +1359,7 @@ class SeekableUnicodeStreamReader(object):
         """
         if est_bytes is None:
             est_bytes = offset
-        bytes = b""
+        bytes = b''
 
         while True:
             # Read in a block of bytes.
@@ -1301,7 +1420,7 @@ class SeekableUnicodeStreamReader(object):
         if self.DEBUG:
             self.stream.seek(filepos)
             check1 = self._incr_decode(self.stream.read(50))[0]
-            check2 = "".join(self.linebuffer)
+            check2 = ''.join(self.linebuffer)
             assert check1.startswith(check2) or check2.startswith(check1)
 
         # Return to our original filepos (so we don't have to throw
@@ -1322,7 +1441,7 @@ class SeekableUnicodeStreamReader(object):
         unicode string.  ``linebuffer`` is not included in the result.
         """
         if size == 0:
-            return ""
+            return ''
 
         # Skip past the byte order marker, if present.
         if self._bom and self.stream.tell() == 0:
@@ -1367,7 +1486,7 @@ class SeekableUnicodeStreamReader(object):
         """
         while True:
             try:
-                return self.decode(bytes, "strict")
+                return self.decode(bytes, 'strict')
             except UnicodeDecodeError as exc:
                 # If the exception occurs at the end of the string,
                 # then assume that it's a truncation error.
@@ -1375,7 +1494,7 @@ class SeekableUnicodeStreamReader(object):
                     return self.decode(bytes[: exc.start], self.errors)
 
                 # Otherwise, if we're being strict, then raise it.
-                elif self.errors == "strict":
+                elif self.errors == 'strict':
                     raise
 
                 # If we're not strict, then re-process it with our
@@ -1384,18 +1503,18 @@ class SeekableUnicodeStreamReader(object):
                     return self.decode(bytes, self.errors)
 
     _BOM_TABLE = {
-        "utf8": [(codecs.BOM_UTF8, None)],
-        "utf16": [(codecs.BOM_UTF16_LE, "utf16-le"), (codecs.BOM_UTF16_BE, "utf16-be")],
-        "utf16le": [(codecs.BOM_UTF16_LE, None)],
-        "utf16be": [(codecs.BOM_UTF16_BE, None)],
-        "utf32": [(codecs.BOM_UTF32_LE, "utf32-le"), (codecs.BOM_UTF32_BE, "utf32-be")],
-        "utf32le": [(codecs.BOM_UTF32_LE, None)],
-        "utf32be": [(codecs.BOM_UTF32_BE, None)],
+        'utf8': [(codecs.BOM_UTF8, None)],
+        'utf16': [(codecs.BOM_UTF16_LE, 'utf16-le'), (codecs.BOM_UTF16_BE, 'utf16-be')],
+        'utf16le': [(codecs.BOM_UTF16_LE, None)],
+        'utf16be': [(codecs.BOM_UTF16_BE, None)],
+        'utf32': [(codecs.BOM_UTF32_LE, 'utf32-le'), (codecs.BOM_UTF32_BE, 'utf32-be')],
+        'utf32le': [(codecs.BOM_UTF32_LE, None)],
+        'utf32be': [(codecs.BOM_UTF32_BE, None)],
     }
 
     def _check_bom(self):
         # Normalize our encoding name
-        enc = re.sub("[ -]", "", self.encoding.lower())
+        enc = re.sub('[ -]', '', self.encoding.lower())
 
         # Look up our encoding in the BOM table.
         bom_info = self._BOM_TABLE.get(enc)
@@ -1416,21 +1535,21 @@ class SeekableUnicodeStreamReader(object):
 
 
 __all__ = [
-    "path",
-    "PathPointer",
-    "FileSystemPathPointer",
-    "BufferedGzipFile",
-    "GzipFileSystemPathPointer",
-    "GzipFileSystemPathPointer",
-    "find",
-    "retrieve",
-    "FORMATS",
-    "AUTO_FORMATS",
-    "load",
-    "show_cfg",
-    "clear_cache",
-    "LazyLoader",
-    "OpenOnDemandZipFile",
-    "GzipFileSystemPathPointer",
-    "SeekableUnicodeStreamReader",
+    'path',
+    'PathPointer',
+    'FileSystemPathPointer',
+    'BufferedGzipFile',
+    'GzipFileSystemPathPointer',
+    'GzipFileSystemPathPointer',
+    'find',
+    'retrieve',
+    'FORMATS',
+    'AUTO_FORMATS',
+    'load',
+    'show_cfg',
+    'clear_cache',
+    'LazyLoader',
+    'OpenOnDemandZipFile',
+    'GzipFileSystemPathPointer',
+    'SeekableUnicodeStreamReader',
 ]
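
SeekableUnicodeStreamReader decodes a byte stream chunk by chunk while keeping tell()/seek() meaningful in byte offsets; the key trick is holding any incomplete multi-byte sequence in bytebuffer until the next read. A small standalone sketch of that carry-over idea using an incremental decoder (not NLTK code; names are illustrative):

    import codecs
    import io

    # Chunked decoding that carries an incomplete multi-byte sequence over
    # to the next read, which is what the bytebuffer above is for.
    def iter_decoded(stream, encoding='utf-8', blocksize=4):
        decoder = codecs.getincrementaldecoder(encoding)()
        while True:
            block = stream.read(blocksize)
            if not block:
                break
            yield decoder.decode(block)
        yield decoder.decode(b'', final=True)

    data = 'résumé ☃'.encode('utf-8')                 # multi-byte characters
    print(''.join(iter_decoded(io.BytesIO(data))))    # résumé ☃
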
index 8ab4f7d..b61db66 100644 (file)
@@ -5,8 +5,9 @@ http://www.phyast.pitt.edu/~micheles/python/documentation.html
 
 Included in NLTK for its support of a nice memoization decorator.
 """
+from __future__ import print_function
 
-__docformat__ = "restructuredtext en"
+__docformat__ = 'restructuredtext en'
 
 ## The basic trick is to generate the source code for the decorated function
 ## with the right signature and to evaluate it.
@@ -19,25 +20,11 @@ import sys
 
 # Hack to keep NLTK's "tokenize" module from colliding with the "tokenize" in
 # the Python standard library.
-OLD_SYS_PATH = sys.path[:]
+old_sys_path = sys.path[:]
 sys.path = [p for p in sys.path if p and "nltk" not in p]
 import inspect
 
-sys.path = OLD_SYS_PATH
-
-def __legacysignature(signature):
-    """
-    For retrocompatibility reasons, we don't use a standard Signature.
-    Instead, we use the string generated by this method.
-    Basically, from a Signature we create a string and remove the default values.
-    """
-    listsignature = str(signature)[1:-1].split(",")
-    for counter, param in enumerate(listsignature):
-        if param.count("=") > 0:
-            listsignature[counter] = param[0:param.index("=")].strip()
-        else:
-            listsignature[counter] = param.strip()
-    return ", ".join(listsignature)
+sys.path = old_sys_path
 
 
 def getinfo(func):
@@ -47,7 +34,6 @@ def getinfo(func):
     - argnames (the names of the arguments : list)
     - defaults (the values of the default arguments : tuple)
     - signature (the signature : str)
-    - fullsignature (the full signature : Signature)
     - doc (the docstring : str)
     - module (the module name : str)
     - dict (the function __dict__ : str)
@@ -66,25 +52,24 @@ def getinfo(func):
 
     >>> info["signature"]
     'self, x, y, *args, **kw'
-
-    >>> info["fullsignature"]
-    <Signature (self, x=1, y=2, *args, **kw)>
     """
     assert inspect.ismethod(func) or inspect.isfunction(func)
-    argspec = inspect.getfullargspec(func)
-    regargs, varargs, varkwargs = argspec[:3]
+    if sys.version_info[0] >= 3:
+        argspec = inspect.getfullargspec(func)
+    else:
+        argspec = inspect.getargspec(func)
+    regargs, varargs, varkwargs, defaults = argspec[:4]
     argnames = list(regargs)
     if varargs:
         argnames.append(varargs)
     if varkwargs:
         argnames.append(varkwargs)
-    fullsignature = inspect.signature(func)
-    # Convert Signature to str
-    signature = __legacysignature(fullsignature)
-
+    signature = inspect.formatargspec(
+        regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
+    )[1:-1]
 
     # pypy compatibility
-    if hasattr(func, "__closure__"):
+    if hasattr(func, '__closure__'):
         _closure = func.__closure__
         _globals = func.__globals__
     else:
@@ -95,7 +80,6 @@ def getinfo(func):
         name=func.__name__,
         argnames=argnames,
         signature=signature,
-        fullsignature=fullsignature,
         defaults=func.__defaults__,
         doc=func.__doc__,
         module=func.__module__,
@@ -105,14 +89,14 @@ def getinfo(func):
     )
 
 
+# akin to functools.update_wrapper
 def update_wrapper(wrapper, model, infodict=None):
-    " akin to functools.update_wrapper "
     infodict = infodict or getinfo(model)
-    wrapper.__name__ = infodict["name"]
-    wrapper.__doc__ = infodict["doc"]
-    wrapper.__module__ = infodict["module"]
-    wrapper.__dict__.update(infodict["dict"])
-    wrapper.__defaults__ = infodict["defaults"]
+    wrapper.__name__ = infodict['name']
+    wrapper.__doc__ = infodict['doc']
+    wrapper.__module__ = infodict['module']
+    wrapper.__dict__.update(infodict['dict'])
+    wrapper.__defaults__ = infodict['defaults']
     wrapper.undecorated = model
     return wrapper
 
@@ -130,7 +114,7 @@ def new_wrapper(wrapper, model):
     else:  # assume model is a function
         infodict = getinfo(model)
     assert (
-        not "_wrapper_" in infodict["argnames"]
+        not '_wrapper_' in infodict["argnames"]
     ), '"_wrapper_" is a reserved argument name!'
     src = "lambda %(signature)s: _wrapper_(%(signature)s)" % infodict
     funcopy = eval(src, dict(_wrapper_=wrapper))
@@ -150,12 +134,12 @@ def decorator_factory(cls):
     method.
     """
     attrs = set(dir(cls))
-    if "__call__" in attrs:
+    if '__call__' in attrs:
         raise TypeError(
-            "You cannot decorate a class with a nontrivial " "__call__ method"
+            'You cannot decorate a class with a nontrivial ' '__call__ method'
         )
-    if "call" not in attrs:
-        raise TypeError("You cannot decorate a class without a " ".call method")
+    if 'call' not in attrs:
+        raise TypeError('You cannot decorate a class without a ' '.call method')
     cls.__call__ = __call__
     return cls
 
@@ -195,10 +179,10 @@ def decorator(caller):
 
     def _decorator(func):  # the real meat is here
         infodict = getinfo(func)
-        argnames = infodict["argnames"]
+        argnames = infodict['argnames']
         assert not (
-            "_call_" in argnames or "_func_" in argnames
-        ), "You cannot use _call_ or _func_ as argument names!"
+            '_call_' in argnames or '_func_' in argnames
+        ), 'You cannot use _call_ or _func_ as argument names!'
         src = "lambda %(signature)s: _call_(_func_, %(signature)s)" % infodict
         # import sys; print >> sys.stderr, src # for debugging purposes
         dec_func = eval(src, dict(_func_=func, _call_=caller))
@@ -223,9 +207,10 @@ def memoize(func, *args):
     # memoize_dic is created at the first call
     if args in dic:
         return dic[args]
-    result = func(*args)
-    dic[args] = result
-    return result
+    else:
+        result = func(*args)
+        dic[args] = result
+        return result
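
memoize caches each function's results in a dict keyed by the positional-argument tuple, created on the first call. A standalone sketch of the same idea built on functools.wraps rather than this module's decorator() machinery (names are illustrative):

    import functools

    def memoize(func):
        cache = {}

        @functools.wraps(func)
        def wrapper(*args):
            if args not in cache:
                cache[args] = func(*args)
            return cache[args]

        return wrapper

    @memoize
    def fib(n):
        return n if n < 2 else fib(n - 1) + fib(n - 2)

    print(fib(30))   # 832040, each fib(k) computed only once
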
 
 
 ##########################     LEGALESE    ###############################
index 097e574..8874e7c 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Corpus & Model Downloader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -67,6 +67,7 @@ or::
     python -m nltk.downloader [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS
 """
 # ----------------------------------------------------------------------
+from __future__ import print_function, division, unicode_literals
 
 """
 
@@ -165,7 +166,7 @@ from xml.etree import ElementTree
 
 try:
     TKINTER = True
-    from tkinter import (
+    from six.moves.tkinter import (
         Tk,
         Frame,
         Label,
@@ -176,17 +177,20 @@ try:
         IntVar,
         TclError,
     )
-    from tkinter.messagebox import showerror
+    from six.moves.tkinter_messagebox import showerror
     from nltk.draw.table import Table
     from nltk.draw.util import ShowText
 except ImportError:
     TKINTER = False
     TclError = ValueError
 
-from urllib.request import urlopen
-from urllib.error import HTTPError, URLError
+from six import string_types, text_type
+from six.moves import input
+from six.moves.urllib.request import urlopen
+from six.moves.urllib.error import HTTPError, URLError
 
 import nltk
+from nltk.compat import python_2_unicode_compatible
 
 # urllib2 = nltk.internals.import_from_stdlib('urllib2')
 
@@ -195,6 +199,8 @@ import nltk
 # Directory entry objects (from the data server's index file)
 ######################################################################
 
+
+@python_2_unicode_compatible
 class Package(object):
     """
     A directory entry for a downloadable package.  These entries are
@@ -209,15 +215,15 @@ class Package(object):
         id,
         url,
         name=None,
-        subdir="",
+        subdir='',
         size=None,
         unzipped_size=None,
         checksum=None,
         svn_revision=None,
-        copyright="Unknown",
-        contact="Unknown",
-        license="Unknown",
-        author="Unknown",
+        copyright='Unknown',
+        contact='Unknown',
+        license='Unknown',
+        author='Unknown',
         unzip=True,
         **kw
     ):
@@ -260,7 +266,7 @@ class Package(object):
         self.author = author
         """Author of this package."""
 
-        ext = os.path.splitext(url.split("/")[-1])[1]
+        ext = os.path.splitext(url.split('/')[-1])[1]
         self.filename = os.path.join(subdir, id + ext)
         """The filename that should be used for this package's file.  It
            is formed by joining ``self.subdir`` with ``self.id``, and
@@ -275,19 +281,20 @@ class Package(object):
 
     @staticmethod
     def fromxml(xml):
-        if isinstance(xml, str):
+        if isinstance(xml, string_types):
             xml = ElementTree.parse(xml)
         for key in xml.attrib:
-            xml.attrib[key] = str(xml.attrib[key])
+            xml.attrib[key] = text_type(xml.attrib[key])
         return Package(**xml.attrib)
 
     def __lt__(self, other):
         return self.id < other.id
 
     def __repr__(self):
-        return "<Package %s>" % self.id
+        return '<Package %s>' % self.id
 
 
+@python_2_unicode_compatible
 class Collection(object):
     """
     A directory entry for a collection of downloadable packages.
@@ -315,18 +322,18 @@ class Collection(object):
 
     @staticmethod
     def fromxml(xml):
-        if isinstance(xml, str):
+        if isinstance(xml, string_types):
             xml = ElementTree.parse(xml)
         for key in xml.attrib:
-            xml.attrib[key] = str(xml.attrib[key])
-        children = [child.get("ref") for child in xml.findall("item")]
+            xml.attrib[key] = text_type(xml.attrib[key])
+        children = [child.get('ref') for child in xml.findall('item')]
         return Collection(children=children, **xml.attrib)
 
     def __lt__(self, other):
         return self.id < other.id
 
     def __repr__(self):
-        return "<Collection %s>" % self.id
+        return '<Collection %s>' % self.id
 
 
 ######################################################################
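
Package.fromxml() and Collection.fromxml() turn an element of the data server's index XML into keyword arguments for the corresponding constructor. A small sketch of that attribute extraction with ElementTree (the XML snippet and URL below are made up for illustration):

    from xml.etree import ElementTree

    # Illustrative only: reduce an index entry to constructor keyword
    # arguments, as fromxml() does.
    xml = ElementTree.fromstring(
        '<package id="ycoe" url="https://example.org/ycoe.zip" subdir="corpora" />'
    )
    attrib = {key: str(value) for key, value in xml.attrib.items()}
    print(attrib)   # e.g. {'id': 'ycoe', 'url': 'https://example.org/ycoe.zip', 'subdir': 'corpora'}
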
@@ -454,7 +461,7 @@ class Downloader(object):
        server index will be considered 'stale,' and will be
        re-downloaded."""
 
-    DEFAULT_URL = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml"
+    DEFAULT_URL = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml'
     """The default URL for the NLTK data server's index.  An
        alternative URL can be specified when creating a new
        ``Downloader`` object."""
@@ -463,16 +470,16 @@ class Downloader(object):
     # Status Constants
     # /////////////////////////////////////////////////////////////////
 
-    INSTALLED = "installed"
+    INSTALLED = 'installed'
     """A status string indicating that a package or collection is
        installed and up-to-date."""
-    NOT_INSTALLED = "not installed"
+    NOT_INSTALLED = 'not installed'
     """A status string indicating that a package or collection is
        not installed."""
-    STALE = "out of date"
+    STALE = 'out of date'
     """A status string indicating that a package or collection is
        corrupt or out-of-date."""
-    PARTIAL = "partial"
+    PARTIAL = 'partial'
     """A status string indicating that a collection is partially
        installed (i.e., only some of its packages are installed.)"""
 
@@ -529,21 +536,21 @@ class Downloader(object):
         lines = 0  # for more_prompt
         if download_dir is None:
             download_dir = self._download_dir
-            print("Using default data directory (%s)" % download_dir)
+            print('Using default data directory (%s)' % download_dir)
         if header:
-            print("=" * (26 + len(self._url)))
-            print(" Data server index for <%s>" % self._url)
-            print("=" * (26 + len(self._url)))
+            print('=' * (26 + len(self._url)))
+            print(' Data server index for <%s>' % self._url)
+            print('=' * (26 + len(self._url)))
             lines += 3  # for more_prompt
         stale = partial = False
 
         categories = []
         if show_packages:
-            categories.append("packages")
+            categories.append('packages')
         if show_collections:
-            categories.append("collections")
+            categories.append('collections')
         for category in categories:
-            print("%s:" % category.capitalize())
+            print('%s:' % category.capitalize())
             lines += 1  # for more_prompt
             for info in sorted(getattr(self, category)(), key=str):
                 status = self.status(info, download_dir)
@@ -554,28 +561,28 @@ class Downloader(object):
                 if status == self.PARTIAL:
                     partial = True
                 prefix = {
-                    self.INSTALLED: "*",
-                    self.STALE: "-",
-                    self.PARTIAL: "P",
-                    self.NOT_INSTALLED: " ",
+                    self.INSTALLED: '*',
+                    self.STALE: '-',
+                    self.PARTIAL: 'P',
+                    self.NOT_INSTALLED: ' ',
                 }[status]
                 name = textwrap.fill(
-                    "-" * 27 + (info.name or info.id), 75, subsequent_indent=27 * " "
+                    '-' * 27 + (info.name or info.id), 75, subsequent_indent=27 * ' '
                 )[27:]
-                print("  [%s] %s %s" % (prefix, info.id.ljust(20, "."), name))
-                lines += len(name.split("\n"))  # for more_prompt
+                print('  [%s] %s %s' % (prefix, info.id.ljust(20, '.'), name))
+                lines += len(name.split('\n'))  # for more_prompt
                 if more_prompt and lines > 20:
                     user_input = input("Hit Enter to continue: ")
-                    if user_input.lower() in ("x", "q"):
+                    if user_input.lower() in ('x', 'q'):
                         return
                     lines = 0
             print()
-        msg = "([*] marks installed packages"
+        msg = '([*] marks installed packages'
         if stale:
-            msg += "; [-] marks out-of-date or corrupt packages"
+            msg += '; [-] marks out-of-date or corrupt packages'
         if partial:
-            msg += "; [P] marks partially installed collections"
-        print(textwrap.fill(msg + ")", subsequent_indent=" ", width=76))
+            msg += '; [P] marks partially installed collections'
+        print(textwrap.fill(msg + ')', subsequent_indent=' ', width=76))
 
     def packages(self):
         self._update_index()
@@ -583,11 +590,11 @@ class Downloader(object):
 
     def corpora(self):
         self._update_index()
-        return [pkg for (id, pkg) in self._packages.items() if pkg.subdir == "corpora"]
+        return [pkg for (id, pkg) in self._packages.items() if pkg.subdir == 'corpora']
 
     def models(self):
         self._update_index()
-        return [pkg for (id, pkg) in self._packages.items() if pkg.subdir != "corpora"]
+        return [pkg for (id, pkg) in self._packages.items() if pkg.subdir != 'corpora']
 
     def collections(self):
         self._update_index()
@@ -598,7 +605,7 @@ class Downloader(object):
     # /////////////////////////////////////////////////////////////////
 
     def _info_or_id(self, info_or_id):
-        if isinstance(info_or_id, str):
+        if isinstance(info_or_id, string_types):
             return self.info(info_or_id)
         else:
             return info_or_id
@@ -628,7 +635,7 @@ class Downloader(object):
         try:
             info = self._info_or_id(info_or_id)
         except (IOError, ValueError) as e:
-            yield ErrorMessage(None, "Error loading %s: %s" % (info_or_id, e))
+            yield ErrorMessage(None, 'Error loading %s: %s' % (info_or_id, e))
             return
 
         # Handle collections.
@@ -708,7 +715,8 @@ class Downloader(object):
         yield ProgressMessage(5)
         try:
             infile = urlopen(info.url)
-            with open(filepath, "wb") as outfile:
+            with open(filepath, 'wb') as outfile:
+                # print info.size
                 num_blocks = max(1, info.size / (1024 * 16))
                 for block in itertools.count():
                     s = infile.read(1024 * 16)  # 16k blocks.
@@ -721,14 +729,14 @@ class Downloader(object):
         except IOError as e:
             yield ErrorMessage(
                 info,
-                "Error downloading %r from <%s>:" "\n  %s" % (info.id, info.url, e),
+                'Error downloading %r from <%s>:' '\n  %s' % (info.id, info.url, e),
             )
             return
         yield FinishDownloadMessage(info)
         yield ProgressMessage(80)
 
         # If it's a zipfile, uncompress it.
-        if info.filename.endswith(".zip"):
+        if info.filename.endswith('.zip'):
             zipdir = os.path.join(download_dir, info.subdir)
             # Unzip if we're unzipping by default; *or* if it's already
             # been unzipped (presumably a previous version).
@@ -748,7 +756,7 @@ class Downloader(object):
         download_dir=None,
         quiet=False,
         force=False,
-        prefix="[nltk_data] ",
+        prefix='[nltk_data] ',
         halt_on_error=True,
         raise_on_error=False,
         print_error_to=sys.stderr,
@@ -767,12 +775,12 @@ class Downloader(object):
 
         else:
             # Define a helper function for displaying output:
-            def show(s, prefix2=""):
+            def show(s, prefix2=''):
                 print_to(
                     textwrap.fill(
                         s,
                         initial_indent=prefix + prefix2,
-                        subsequent_indent=prefix + prefix2 + " " * 4,
+                        subsequent_indent=prefix + prefix2 + ' ' * 4,
                     )
                 )
 
@@ -788,7 +796,7 @@ class Downloader(object):
                     if not quiet:
                         print_to("Error installing package. Retry? [n/y/e]")
                         choice = input().strip()
-                        if choice in ["y", "Y"]:
+                        if choice in ['y', 'Y']:
                             if not self.download(
                                 msg.package.id,
                                 download_dir,
@@ -799,40 +807,40 @@ class Downloader(object):
                                 raise_on_error,
                             ):
                                 return False
-                        elif choice in ["e", "E"]:
+                        elif choice in ['e', 'E']:
                             return False
 
                 # All other messages
                 if not quiet:
                     # Collection downloading messages:
                     if isinstance(msg, StartCollectionMessage):
-                        show("Downloading collection %r" % msg.collection.id)
-                        prefix += "   | "
+                        show('Downloading collection %r' % msg.collection.id)
+                        prefix += '   | '
                         print_to(prefix)
                     elif isinstance(msg, FinishCollectionMessage):
                         print_to(prefix)
                         prefix = prefix[:-4]
                         if self._errors:
                             show(
-                                "Downloaded collection %r with errors"
+                                'Downloaded collection %r with errors'
                                 % msg.collection.id
                             )
                         else:
-                            show("Done downloading collection %s" % msg.collection.id)
+                            show('Done downloading collection %s' % msg.collection.id)
 
                     # Package downloading messages:
                     elif isinstance(msg, StartPackageMessage):
                         show(
-                            "Downloading package %s to %s..."
+                            'Downloading package %s to %s...'
                             % (msg.package.id, download_dir)
                         )
                     elif isinstance(msg, UpToDateMessage):
-                        show("Package %s is already up-to-date!" % msg.package.id, "  ")
+                        show('Package %s is already up-to-date!' % msg.package.id, '  ')
                     # elif isinstance(msg, StaleMessage):
                     #    show('Package %s is out-of-date or corrupt' %
                     #         msg.package.id, '  ')
                     elif isinstance(msg, StartUnzipMessage):
-                        show("Unzipping %s." % msg.package.filename, "  ")
+                        show('Unzipping %s.' % msg.package.filename, '  ')
 
                     # Data directory message:
                     elif isinstance(msg, SelectDownloadDirMessage):
@@ -903,7 +911,7 @@ class Downloader(object):
 
         # If it's a zipfile, and it's been at least partially
         # unzipped, then check if it's been fully unzipped.
-        if filepath.endswith(".zip"):
+        if filepath.endswith('.zip'):
             unzipdir = filepath[:-4]
             if not os.path.exists(unzipdir):
                 return self.INSTALLED  # but not unzipped -- ok!
@@ -921,7 +929,7 @@ class Downloader(object):
         # Otherwise, everything looks good.
         return self.INSTALLED
 
-    def update(self, quiet=False, prefix="[nltk_data] "):
+    def update(self, quiet=False, prefix='[nltk_data] '):
         """
         Re-download any packages whose status is STALE.
         """
@@ -956,12 +964,12 @@ class Downloader(object):
         self._index_timestamp = time.time()
 
         # Build a dictionary of packages.
-        packages = [Package.fromxml(p) for p in self._index.findall("packages/package")]
+        packages = [Package.fromxml(p) for p in self._index.findall('packages/package')]
         self._packages = dict((p.id, p) for p in packages)
 
         # Build a dictionary of collections.
         collections = [
-            Collection.fromxml(c) for c in self._index.findall("collections/collection")
+            Collection.fromxml(c) for c in self._index.findall('collections/collection')
         ]
         self._collections = dict((c.id, c) for c in collections)
 
@@ -974,7 +982,7 @@ class Downloader(object):
                     collection.children[i] = self._collections[child_id]
                 else:
                     print(
-                        "removing collection member with no package: {}".format(
+                        'removing collection member with no package: {}'.format(
                             child_id
                         )
                     )
@@ -1013,18 +1021,18 @@ class Downloader(object):
             return self._packages[id]
         if id in self._collections:
             return self._collections[id]
-        raise ValueError("Package %r not found in index" % id)
+        raise ValueError('Package %r not found in index' % id)
 
     def xmlinfo(self, id):
         """Return the XML info record for the given item"""
         self._update_index()
-        for package in self._index.findall("packages/package"):
-            if package.get("id") == id:
+        for package in self._index.findall('packages/package'):
+            if package.get('id') == id:
                 return package
-        for collection in self._index.findall("collections/collection"):
-            if collection.get("id") == id:
+        for collection in self._index.findall('collections/collection'):
+            if collection.get('id') == id:
                 return collection
-        raise ValueError("Package %r not found in index" % id)
+        raise ValueError('Package %r not found in index' % id)
 
     # /////////////////////////////////////////////////////////////////
     # URL & Data Directory
@@ -1065,7 +1073,7 @@ class Downloader(object):
         ``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
         """
         # Check if we are on GAE where we cannot write into filesystem.
-        if "APPENGINE_RUNTIME" in os.environ:
+        if 'APPENGINE_RUNTIME' in os.environ:
             return
 
         # Check if we have sufficient permissions to install in a
@@ -1075,17 +1083,17 @@ class Downloader(object):
                 return nltkdir
 
         # On Windows, use %APPDATA%
-        if sys.platform == "win32" and "APPDATA" in os.environ:
-            homedir = os.environ["APPDATA"]
+        if sys.platform == 'win32' and 'APPDATA' in os.environ:
+            homedir = os.environ['APPDATA']
 
         # Otherwise, install in the user's home directory.
         else:
-            homedir = os.path.expanduser("~/")
-            if homedir == "~/":
+            homedir = os.path.expanduser('~/')
+            if homedir == '~/':
                 raise ValueError("Could not find a default download directory")
 
         # append "nltk_data" to the home directory
-        return os.path.join(homedir, "nltk_data")
+        return os.path.join(homedir, 'nltk_data')
 
     def _get_download_dir(self):
         """
@@ -1124,48 +1132,51 @@ class DownloaderShell(object):
         self._ds = dataserver
 
     def _simple_interactive_menu(self, *options):
-        print("-" * 75)
-        spc = (68 - sum(len(o) for o in options)) // (len(options) - 1) * " "
-        print("    " + spc.join(options))
-        print("-" * 75)
+        print('-' * 75)
+        spc = (68 - sum(len(o) for o in options)) // (len(options) - 1) * ' '
+        print('    ' + spc.join(options))
+        # w = 76/len(options)
+        # fmt = '  ' + ('%-'+str(w)+'s')*(len(options)-1) + '%s'
+        # print fmt % options
+        print('-' * 75)
 
     def run(self):
-        print("NLTK Downloader")
+        print('NLTK Downloader')
         while True:
             self._simple_interactive_menu(
-                "d) Download",
-                "l) List",
-                " u) Update",
-                "c) Config",
-                "h) Help",
-                "q) Quit",
+                'd) Download',
+                'l) List',
+                ' u) Update',
+                'c) Config',
+                'h) Help',
+                'q) Quit',
             )
-            user_input = input("Downloader> ").strip()
+            user_input = input('Downloader> ').strip()
             if not user_input:
                 print()
                 continue
             command = user_input.lower().split()[0]
             args = user_input.split()[1:]
             try:
-                if command == "l":
+                if command == 'l':
                     print()
                     self._ds.list(self._ds.download_dir, header=False, more_prompt=True)
-                elif command == "h":
+                elif command == 'h':
                     self._simple_interactive_help()
-                elif command == "c":
+                elif command == 'c':
                     self._simple_interactive_config()
-                elif command in ("q", "x"):
+                elif command in ('q', 'x'):
                     return
-                elif command == "d":
+                elif command == 'd':
                     self._simple_interactive_download(args)
-                elif command == "u":
+                elif command == 'u':
                     self._simple_interactive_update()
                 else:
-                    print("Command %r unrecognized" % user_input)
+                    print('Command %r unrecognized' % user_input)
             except HTTPError as e:
-                print("Error reading from server: %s" % e)
+                print('Error reading from server: %s' % e)
             except URLError as e:
-                print("Error connecting to server: %s" % e.reason)
+                print('Error connecting to server: %s' % e.reason)
             # try checking if user_input is a package name, &
             # downloading it?
             print()
@@ -1174,15 +1185,15 @@ class DownloaderShell(object):
         if args:
             for arg in args:
                 try:
-                    self._ds.download(arg, prefix="    ")
+                    self._ds.download(arg, prefix='    ')
                 except (IOError, ValueError) as e:
                     print(e)
         else:
             while True:
                 print()
-                print("Download which package (l=list; x=cancel)?")
-                user_input = input("  Identifier> ")
-                if user_input.lower() == "l":
+                print('Download which package (l=list; x=cancel)?')
+                user_input = input('  Identifier> ')
+                if user_input.lower() == 'l':
                     self._ds.list(
                         self._ds.download_dir,
                         header=False,
@@ -1190,12 +1201,12 @@ class DownloaderShell(object):
                         skip_installed=True,
                     )
                     continue
-                elif user_input.lower() in ("x", "q", ""):
+                elif user_input.lower() in ('x', 'q', ''):
                     return
                 elif user_input:
                     for id in user_input.split():
                         try:
-                            self._ds.download(id, prefix="    ")
+                            self._ds.download(id, prefix='    ')
                         except (IOError, ValueError) as e:
                             print(e)
                     break
@@ -1204,83 +1215,83 @@ class DownloaderShell(object):
         while True:
             stale_packages = []
             stale = partial = False
-            for info in sorted(getattr(self._ds, "packages")(), key=str):
+            for info in sorted(getattr(self._ds, 'packages')(), key=str):
                 if self._ds.status(info) == self._ds.STALE:
                     stale_packages.append((info.id, info.name))
 
             print()
             if stale_packages:
-                print("Will update following packages (o=ok; x=cancel)")
+                print('Will update following packages (o=ok; x=cancel)')
                 for pid, pname in stale_packages:
                     name = textwrap.fill(
-                        "-" * 27 + (pname), 75, subsequent_indent=27 * " "
+                        '-' * 27 + (pname), 75, subsequent_indent=27 * ' '
                     )[27:]
-                    print("  [ ] %s %s" % (pid.ljust(20, "."), name))
+                    print('  [ ] %s %s' % (pid.ljust(20, '.'), name))
                 print()
 
-                user_input = input("  Identifier> ")
-                if user_input.lower() == "o":
+                user_input = input('  Identifier> ')
+                if user_input.lower() == 'o':
                     for pid, pname in stale_packages:
                         try:
-                            self._ds.download(pid, prefix="    ")
+                            self._ds.download(pid, prefix='    ')
                         except (IOError, ValueError) as e:
                             print(e)
                     break
-                elif user_input.lower() in ("x", "q", ""):
+                elif user_input.lower() in ('x', 'q', ''):
                     return
             else:
-                print("Nothing to update.")
+                print('Nothing to update.')
                 return
 
     def _simple_interactive_help(self):
         print()
-        print("Commands:")
+        print('Commands:')
         print(
-            "  d) Download a package or collection     u) Update out of date packages"
+            '  d) Download a package or collection     u) Update out of date packages'
         )
-        print("  l) List packages & collections          h) Help")
-        print("  c) View & Modify Configuration          q) Quit")
+        print('  l) List packages & collections          h) Help')
+        print('  c) View & Modify Configuration          q) Quit')
 
     def _show_config(self):
         print()
-        print("Data Server:")
-        print("  - URL: <%s>" % self._ds.url)
-        print(("  - %d Package Collections Available" % len(self._ds.collections())))
-        print(("  - %d Individual Packages Available" % len(self._ds.packages())))
+        print('Data Server:')
+        print('  - URL: <%s>' % self._ds.url)
+        print(('  - %d Package Collections Available' % len(self._ds.collections())))
+        print(('  - %d Individual Packages Available' % len(self._ds.packages())))
         print()
-        print("Local Machine:")
-        print("  - Data directory: %s" % self._ds.download_dir)
+        print('Local Machine:')
+        print('  - Data directory: %s' % self._ds.download_dir)
 
     def _simple_interactive_config(self):
         self._show_config()
         while True:
             print()
             self._simple_interactive_menu(
-                "s) Show Config", "u) Set Server URL", "d) Set Data Dir", "m) Main Menu"
+                's) Show Config', 'u) Set Server URL', 'd) Set Data Dir', 'm) Main Menu'
             )
-            user_input = input("Config> ").strip().lower()
-            if user_input == "s":
+            user_input = input('Config> ').strip().lower()
+            if user_input == 's':
                 self._show_config()
-            elif user_input == "d":
-                new_dl_dir = input("  New Directory> ").strip()
-                if new_dl_dir in ("", "x", "q", "X", "Q"):
-                    print("  Cancelled!")
+            elif user_input == 'd':
+                new_dl_dir = input('  New Directory> ').strip()
+                if new_dl_dir in ('', 'x', 'q', 'X', 'Q'):
+                    print('  Cancelled!')
                 elif os.path.isdir(new_dl_dir):
                     self._ds.download_dir = new_dl_dir
                 else:
-                    print(("Directory %r not found!  Create it first." % new_dl_dir))
-            elif user_input == "u":
-                new_url = input("  New URL> ").strip()
-                if new_url in ("", "x", "q", "X", "Q"):
-                    print("  Cancelled!")
+                    print(('Directory %r not found!  Create it first.' % new_dl_dir))
+            elif user_input == 'u':
+                new_url = input('  New URL> ').strip()
+                if new_url in ('', 'x', 'q', 'X', 'Q'):
+                    print('  Cancelled!')
                 else:
-                    if not new_url.startswith(("http://", "https://")):
-                        new_url = "http://" + new_url
+                    if not new_url.startswith(('http://', 'https://')):
+                        new_url = 'http://' + new_url
                     try:
                         self._ds.url = new_url
                     except Exception as e:
-                        print("Error reading <%r>:\n  %s" % (new_url, e))
-            elif user_input == "m":
+                        print('Error reading <%r>:\n  %s' % (new_url, e))
+            elif user_input == 'm':
                 break
 
 
@@ -1295,36 +1306,36 @@ class DownloaderGUI(object):
     # /////////////////////////////////////////////////////////////////
 
     COLUMNS = [
-        "",
-        "Identifier",
-        "Name",
-        "Size",
-        "Status",
-        "Unzipped Size",
-        "Copyright",
-        "Contact",
-        "License",
-        "Author",
-        "Subdir",
-        "Checksum",
+        '',
+        'Identifier',
+        'Name',
+        'Size',
+        'Status',
+        'Unzipped Size',
+        'Copyright',
+        'Contact',
+        'License',
+        'Author',
+        'Subdir',
+        'Checksum',
     ]
     """A list of the names of columns.  This controls the order in
        which the columns will appear.  If this is edited, then
        ``_package_to_columns()`` may need to be edited to match."""
 
-    COLUMN_WEIGHTS = {"": 0, "Name": 5, "Size": 0, "Status": 0}
+    COLUMN_WEIGHTS = {'': 0, 'Name': 5, 'Size': 0, 'Status': 0}
     """A dictionary specifying how columns should be resized when the
        table is resized.  Columns with weight 0 will not be resized at
        all; and columns with high weight will be resized more.
        Default weight (for columns not explicitly listed) is 1."""
 
     COLUMN_WIDTHS = {
-        "": 1,
-        "Identifier": 20,
-        "Name": 45,
-        "Size": 10,
-        "Unzipped Size": 10,
-        "Status": 12,
+        '': 1,
+        'Identifier': 20,
+        'Name': 45,
+        'Size': 10,
+        'Unzipped Size': 10,
+        'Status': 12,
     }
     """A dictionary specifying how wide each column should be, in
        characters.  The default width (for columns not explicitly
@@ -1334,7 +1345,7 @@ class DownloaderGUI(object):
     """The default width for columns that are not explicitly listed
        in ``COLUMN_WIDTHS``."""
 
-    INITIAL_COLUMNS = ["", "Identifier", "Name", "Size", "Status"]
+    INITIAL_COLUMNS = ['', 'Identifier', 'Name', 'Size', 'Status']
     """The set of columns that should be displayed by default."""
 
     # Perform a few import-time sanity checks to make sure that the
@@ -1350,25 +1361,25 @@ class DownloaderGUI(object):
     # Color Configuration
     # /////////////////////////////////////////////////////////////////
 
-    _BACKDROP_COLOR = ("#000", "#ccc")
+    _BACKDROP_COLOR = ('#000', '#ccc')
 
     _ROW_COLOR = {
-        Downloader.INSTALLED: ("#afa", "#080"),
-        Downloader.PARTIAL: ("#ffa", "#880"),
-        Downloader.STALE: ("#faa", "#800"),
-        Downloader.NOT_INSTALLED: ("#fff", "#888"),
+        Downloader.INSTALLED: ('#afa', '#080'),
+        Downloader.PARTIAL: ('#ffa', '#880'),
+        Downloader.STALE: ('#faa', '#800'),
+        Downloader.NOT_INSTALLED: ('#fff', '#888'),
     }
 
-    _MARK_COLOR = ("#000", "#ccc")
+    _MARK_COLOR = ('#000', '#ccc')
 
     # _FRONT_TAB_COLOR = ('#ccf', '#008')
     # _BACK_TAB_COLOR = ('#88a', '#448')
-    _FRONT_TAB_COLOR = ("#fff", "#45c")
-    _BACK_TAB_COLOR = ("#aaa", "#67a")
+    _FRONT_TAB_COLOR = ('#fff', '#45c')
+    _BACK_TAB_COLOR = ('#aaa', '#67a')
 
-    _PROGRESS_COLOR = ("#f00", "#aaa")
+    _PROGRESS_COLOR = ('#f00', '#aaa')
 
-    _TAB_FONT = "helvetica -16 bold"
+    _TAB_FONT = 'helvetica -16 bold'
 
     # /////////////////////////////////////////////////////////////////
     # Constructor
@@ -1390,17 +1401,17 @@ class DownloaderGUI(object):
         # A message log.
         self._log_messages = []
         self._log_indent = 0
-        self._log("NLTK Downloader Started!")
+        self._log('NLTK Downloader Started!')
 
         # Create the main window.
         top = self.top = Tk()
-        top.geometry("+50+50")
-        top.title("NLTK Downloader")
+        top.geometry('+50+50')
+        top.title('NLTK Downloader')
         top.configure(background=self._BACKDROP_COLOR[1])
 
         # Set up some bindings now, in case anything goes wrong.
-        top.bind("<Control-q>", self.destroy)
-        top.bind("<Control-x>", self.destroy)
+        top.bind('<Control-q>', self.destroy)
+        top.bind('<Control-x>', self.destroy)
         self._destroyed = False
 
         self._column_vars = {}
@@ -1411,9 +1422,9 @@ class DownloaderGUI(object):
         try:
             self._fill_table()
         except HTTPError as e:
-            showerror("Error reading from server", e)
+            showerror('Error reading from server', e)
         except URLError as e:
-            showerror("Error connecting to server", e.reason)
+            showerror('Error connecting to server', e.reason)
 
         self._show_info()
         self._select_columns()
@@ -1421,11 +1432,11 @@ class DownloaderGUI(object):
 
         # Make sure we get notified when we're destroyed, so we can
         # cancel any download in progress.
-        self._table.bind("<Destroy>", self._destroy)
+        self._table.bind('<Destroy>', self._destroy)
 
     def _log(self, msg):
         self._log_messages.append(
-            "%s %s%s" % (time.ctime(), " | " * self._log_indent, msg)
+            '%s %s%s' % (time.ctime(), ' | ' * self._log_indent, msg)
         )
 
     # /////////////////////////////////////////////////////////////////
@@ -1434,35 +1445,35 @@ class DownloaderGUI(object):
 
     def _init_widgets(self):
         # Create the top-level frame structures
-        f1 = Frame(self.top, relief="raised", border=2, padx=8, pady=0)
-        f1.pack(sid="top", expand=True, fill="both")
+        f1 = Frame(self.top, relief='raised', border=2, padx=8, pady=0)
+        f1.pack(sid='top', expand=True, fill='both')
         f1.grid_rowconfigure(2, weight=1)
         f1.grid_columnconfigure(0, weight=1)
         Frame(f1, height=8).grid(column=0, row=0)  # spacer
         tabframe = Frame(f1)
-        tabframe.grid(column=0, row=1, sticky="news")
+        tabframe.grid(column=0, row=1, sticky='news')
         tableframe = Frame(f1)
-        tableframe.grid(column=0, row=2, sticky="news")
+        tableframe.grid(column=0, row=2, sticky='news')
         buttonframe = Frame(f1)
-        buttonframe.grid(column=0, row=3, sticky="news")
+        buttonframe.grid(column=0, row=3, sticky='news')
         Frame(f1, height=8).grid(column=0, row=4)  # spacer
         infoframe = Frame(f1)
-        infoframe.grid(column=0, row=5, sticky="news")
+        infoframe.grid(column=0, row=5, sticky='news')
         Frame(f1, height=8).grid(column=0, row=6)  # spacer
         progressframe = Frame(
             self.top, padx=3, pady=3, background=self._BACKDROP_COLOR[1]
         )
-        progressframe.pack(side="bottom", fill="x")
-        self.top["border"] = 0
-        self.top["highlightthickness"] = 0
+        progressframe.pack(side='bottom', fill='x')
+        self.top['border'] = 0
+        self.top['highlightthickness'] = 0
 
         # Create the tabs
-        self._tab_names = ["Collections", "Corpora", "Models", "All Packages"]
+        self._tab_names = ['Collections', 'Corpora', 'Models', 'All Packages']
         self._tabs = {}
         for i, tab in enumerate(self._tab_names):
             label = Label(tabframe, text=tab, font=self._TAB_FONT)
-            label.pack(side="left", padx=((i + 1) % 2) * 10)
-            label.bind("<Button-1>", self._select_tab)
+            label.pack(side='left', padx=((i + 1) % 2) * 10)
+            label.bind('<Button-1>', self._select_tab)
             self._tabs[tab.lower()] = label
 
         # Create the table.
@@ -1479,51 +1490,51 @@ class DownloaderGUI(object):
         for i, column in enumerate(self.COLUMNS):
             width = self.COLUMN_WIDTHS.get(column, self.DEFAULT_COLUMN_WIDTH)
             self._table.columnconfig(i, width=width)
-        self._table.pack(expand=True, fill="both")
+        self._table.pack(expand=True, fill='both')
         self._table.focus()
-        self._table.bind_to_listboxes("<Double-Button-1>", self._download)
-        self._table.bind("<space>", self._table_mark)
-        self._table.bind("<Return>", self._download)
-        self._table.bind("<Left>", self._prev_tab)
-        self._table.bind("<Right>", self._next_tab)
-        self._table.bind("<Control-a>", self._mark_all)
+        self._table.bind_to_listboxes('<Double-Button-1>', self._download)
+        self._table.bind('<space>', self._table_mark)
+        self._table.bind('<Return>', self._download)
+        self._table.bind('<Left>', self._prev_tab)
+        self._table.bind('<Right>', self._next_tab)
+        self._table.bind('<Control-a>', self._mark_all)
 
         # Create entry boxes for URL & download_dir
         infoframe.grid_columnconfigure(1, weight=1)
 
         info = [
-            ("url", "Server Index:", self._set_url),
-            ("download_dir", "Download Directory:", self._set_download_dir),
+            ('url', 'Server Index:', self._set_url),
+            ('download_dir', 'Download Directory:', self._set_download_dir),
         ]
         self._info = {}
         for (i, (key, label, callback)) in enumerate(info):
-            Label(infoframe, text=label).grid(column=0, row=i, sticky="e")
+            Label(infoframe, text=label).grid(column=0, row=i, sticky='e')
             entry = Entry(
-                infoframe, font="courier", relief="groove", disabledforeground="black"
+                infoframe, font='courier', relief='groove', disabledforeground='black'
             )
             self._info[key] = (entry, callback)
-            entry.bind("<Return>", self._info_save)
-            entry.bind("<Button-1>", lambda e, key=key: self._info_edit(key))
-            entry.grid(column=1, row=i, sticky="ew")
+            entry.bind('<Return>', self._info_save)
+            entry.bind('<Button-1>', lambda e, key=key: self._info_edit(key))
+            entry.grid(column=1, row=i, sticky='ew')
 
         # If the user edits url or download_dir, and then clicks outside
         # the entry box, then save their results.
-        self.top.bind("<Button-1>", self._info_save)
+        self.top.bind('<Button-1>', self._info_save)
 
         # Create Download & Refresh buttons.
         self._download_button = Button(
-            buttonframe, text="Download", command=self._download, width=8
+            buttonframe, text='Download', command=self._download, width=8
         )
-        self._download_button.pack(side="left")
+        self._download_button.pack(side='left')
         self._refresh_button = Button(
-            buttonframe, text="Refresh", command=self._refresh, width=8
+            buttonframe, text='Refresh', command=self._refresh, width=8
         )
-        self._refresh_button.pack(side="right")
+        self._refresh_button.pack(side='right')
 
         # Create Progress bar
         self._progresslabel = Label(
             progressframe,
-            text="",
+            text='',
             foreground=self._BACKDROP_COLOR[0],
             background=self._BACKDROP_COLOR[1],
         )
@@ -1532,38 +1543,38 @@ class DownloaderGUI(object):
             width=200,
             height=16,
             background=self._PROGRESS_COLOR[1],
-            relief="sunken",
+            relief='sunken',
             border=1,
         )
         self._init_progressbar()
-        self._progressbar.pack(side="right")
-        self._progresslabel.pack(side="left")
+        self._progressbar.pack(side='right')
+        self._progresslabel.pack(side='left')
 
     def _init_menu(self):
         menubar = Menu(self.top)
 
         filemenu = Menu(menubar, tearoff=0)
         filemenu.add_command(
-            label="Download", underline=0, command=self._download, accelerator="Return"
+            label='Download', underline=0, command=self._download, accelerator='Return'
         )
         filemenu.add_separator()
         filemenu.add_command(
-            label="Change Server Index",
+            label='Change Server Index',
             underline=7,
-            command=lambda: self._info_edit("url"),
+            command=lambda: self._info_edit('url'),
         )
         filemenu.add_command(
-            label="Change Download Directory",
+            label='Change Download Directory',
             underline=0,
-            command=lambda: self._info_edit("download_dir"),
+            command=lambda: self._info_edit('download_dir'),
         )
         filemenu.add_separator()
-        filemenu.add_command(label="Show Log", underline=5, command=self._show_log)
+        filemenu.add_command(label='Show Log', underline=5, command=self._show_log)
         filemenu.add_separator()
         filemenu.add_command(
-            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+            label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
         )
-        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
 
         # Create a menu to control which columns of the table are
         # shown.  n.b.: we never hide the first two columns (mark and
@@ -1578,7 +1589,7 @@ class DownloaderGUI(object):
             viewmenu.add_checkbutton(
                 label=column, underline=0, variable=var, command=self._select_columns
             )
-        menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+        menubar.add_cascade(label='View', underline=0, menu=viewmenu)
 
         # Create a sort menu
         # [xx] this should be selectbuttons; and it should include
@@ -1586,25 +1597,25 @@ class DownloaderGUI(object):
         sortmenu = Menu(menubar, tearoff=0)
         for column in self._table.column_names[1:]:
             sortmenu.add_command(
-                label="Sort by %s" % column,
-                command=(lambda c=column: self._table.sort_by(c, "ascending")),
+                label='Sort by %s' % column,
+                command=(lambda c=column: self._table.sort_by(c, 'ascending')),
             )
         sortmenu.add_separator()
         # sortmenu.add_command(label='Descending Sort:')
         for column in self._table.column_names[1:]:
             sortmenu.add_command(
-                label="Reverse sort by %s" % column,
-                command=(lambda c=column: self._table.sort_by(c, "descending")),
+                label='Reverse sort by %s' % column,
+                command=(lambda c=column: self._table.sort_by(c, 'descending')),
             )
-        menubar.add_cascade(label="Sort", underline=0, menu=sortmenu)
+        menubar.add_cascade(label='Sort', underline=0, menu=sortmenu)
 
         helpmenu = Menu(menubar, tearoff=0)
-        helpmenu.add_command(label="About", underline=0, command=self.about)
+        helpmenu.add_command(label='About', underline=0, command=self.about)
         helpmenu.add_command(
-            label="Instructions", underline=0, command=self.help, accelerator="F1"
+            label='Instructions', underline=0, command=self.help, accelerator='F1'
         )
-        menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
-        self.top.bind("<F1>", self.help)
+        menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
+        self.top.bind('<F1>', self.help)
 
         self.top.config(menu=menubar)
 
@@ -1620,46 +1631,46 @@ class DownloaderGUI(object):
         try:
             self._fill_table()
         except HTTPError as e:
-            showerror("Error reading from server", e)
+            showerror('Error reading from server', e)
         except URLError as e:
-            showerror("Error connecting to server", e.reason)
+            showerror('Error connecting to server', e.reason)
         self._table.select(0)
 
     def _info_edit(self, info_key):
         self._info_save()  # just in case.
         (entry, callback) = self._info[info_key]
-        entry["state"] = "normal"
-        entry["relief"] = "sunken"
+        entry['state'] = 'normal'
+        entry['relief'] = 'sunken'
         entry.focus()
 
     def _info_save(self, e=None):
         focus = self._table
         for entry, callback in self._info.values():
-            if entry["state"] == "disabled":
+            if entry['state'] == 'disabled':
                 continue
-            if e is not None and e.widget is entry and e.keysym != "Return":
+            if e is not None and e.widget is entry and e.keysym != 'Return':
                 focus = entry
             else:
-                entry["state"] = "disabled"
-                entry["relief"] = "groove"
+                entry['state'] = 'disabled'
+                entry['relief'] = 'groove'
                 callback(entry.get())
         focus.focus()
 
     def _table_reprfunc(self, row, col, val):
-        if self._table.column_names[col].endswith("Size"):
-            if isinstance(val, str):
-                return "  %s" % val
+        if self._table.column_names[col].endswith('Size'):
+            if isinstance(val, string_types):
+                return '  %s' % val
             elif val < 1024 ** 2:
-                return "  %.1f KB" % (val / 1024.0 ** 1)
+                return '  %.1f KB' % (val / 1024.0 ** 1)
             elif val < 1024 ** 3:
-                return "  %.1f MB" % (val / 1024.0 ** 2)
+                return '  %.1f MB' % (val / 1024.0 ** 2)
             else:
-                return "  %.1f GB" % (val / 1024.0 ** 3)
+                return '  %.1f GB' % (val / 1024.0 ** 3)
 
-        if col in (0, ""):
+        if col in (0, ''):
             return str(val)
         else:
-            return "  %s" % val
+            return '  %s' % val
 
     def _set_url(self, url):
         if url == self._ds.url:
@@ -1668,7 +1679,7 @@ class DownloaderGUI(object):
             self._ds.url = url
             self._fill_table()
         except IOError as e:
-            showerror("Error Setting Server Index", str(e))
+            showerror('Error Setting Server Index', str(e))
         self._show_info()
 
     def _set_download_dir(self, download_dir):
@@ -1681,20 +1692,20 @@ class DownloaderGUI(object):
         try:
             self._fill_table()
         except HTTPError as e:
-            showerror("Error reading from server", e)
+            showerror('Error reading from server', e)
         except URLError as e:
-            showerror("Error connecting to server", e.reason)
+            showerror('Error connecting to server', e.reason)
         self._show_info()
 
     def _show_info(self):
-        print("showing info", self._ds.url)
+        print('showing info', self._ds.url)
         for entry, cb in self._info.values():
-            entry["state"] = "normal"
-            entry.delete(0, "end")
-        self._info["url"][0].insert(0, self._ds.url)
-        self._info["download_dir"][0].insert(0, self._ds.download_dir)
+            entry['state'] = 'normal'
+            entry.delete(0, 'end')
+        self._info['url'][0].insert(0, self._ds.url)
+        self._info['download_dir'][0].insert(0, self._ds.download_dir)
         for entry, cb in self._info.values():
-            entry["state"] = "disabled"
+            entry['state'] = 'disabled'
 
     def _prev_tab(self, *e):
         for i, tab in enumerate(self._tab_names):
@@ -1703,9 +1714,9 @@ class DownloaderGUI(object):
                 try:
                     return self._fill_table()
                 except HTTPError as e:
-                    showerror("Error reading from server", e)
+                    showerror('Error reading from server', e)
                 except URLError as e:
-                    showerror("Error connecting to server", e.reason)
+                    showerror('Error connecting to server', e.reason)
 
     def _next_tab(self, *e):
         for i, tab in enumerate(self._tab_names):
@@ -1714,36 +1725,36 @@ class DownloaderGUI(object):
                 try:
                     return self._fill_table()
                 except HTTPError as e:
-                    showerror("Error reading from server", e)
+                    showerror('Error reading from server', e)
                 except URLError as e:
-                    showerror("Error connecting to server", e.reason)
+                    showerror('Error connecting to server', e.reason)
 
     def _select_tab(self, event):
-        self._tab = event.widget["text"].lower()
+        self._tab = event.widget['text'].lower()
         try:
             self._fill_table()
         except HTTPError as e:
-            showerror("Error reading from server", e)
+            showerror('Error reading from server', e)
         except URLError as e:
-            showerror("Error connecting to server", e.reason)
+            showerror('Error connecting to server', e.reason)
 
-    _tab = "collections"
+    _tab = 'collections'
     # _tab = 'corpora'
     _rows = None
 
     def _fill_table(self):
         selected_row = self._table.selected_row()
         self._table.clear()
-        if self._tab == "all packages":
+        if self._tab == 'all packages':
             items = self._ds.packages()
-        elif self._tab == "corpora":
+        elif self._tab == 'corpora':
             items = self._ds.corpora()
-        elif self._tab == "models":
+        elif self._tab == 'models':
             items = self._ds.models()
-        elif self._tab == "collections":
+        elif self._tab == 'collections':
             items = self._ds.collections()
         else:
-            assert 0, "bad tab value %r" % self._tab
+            assert 0, 'bad tab value %r' % self._tab
         rows = [self._package_to_columns(item) for item in items]
         self._table.extend(rows)
 
@@ -1760,7 +1771,7 @@ class DownloaderGUI(object):
                     background=self._BACK_TAB_COLOR[1],
                 )
 
-        self._table.sort_by("Identifier", order="ascending")
+        self._table.sort_by('Identifier', order='ascending')
         self._color_table()
         self._table.select(selected_row)
 
@@ -1774,8 +1785,8 @@ class DownloaderGUI(object):
 
     def _update_table_status(self):
         for row_num in range(len(self._table)):
-            status = self._ds.status(self._table[row_num, "Identifier"])
-            self._table[row_num, "Status"] = status
+            status = self._ds.status(self._table[row_num, 'Identifier'])
+            self._table[row_num, 'Status'] = status
         self._color_table()
 
     def _download(self, *e):
@@ -1785,13 +1796,13 @@ class DownloaderGUI(object):
             return self._download_threaded(*e)
 
         marked = [
-            self._table[row, "Identifier"]
+            self._table[row, 'Identifier']
             for row in range(len(self._table))
-            if self._table[row, 0] != ""
+            if self._table[row, 0] != ''
         ]
         selection = self._table.selected_row()
         if not marked and selection is not None:
-            marked = [self._table[selection, "Identifier"]]
+            marked = [self._table[selection, 'Identifier']]
 
         download_iter = self._ds.incr_download(marked, self._ds.download_dir)
         self._log_indent = 0
@@ -1806,11 +1817,11 @@ class DownloaderGUI(object):
             # self._fill_table(sort=False)
             self._update_table_status()
             afterid = self.top.after(10, self._show_progress, 0)
-            self._afterid["_download_cb"] = afterid
+            self._afterid['_download_cb'] = afterid
             return
 
         def show(s):
-            self._progresslabel["text"] = s
+            self._progresslabel['text'] = s
             self._log(s)
 
         if isinstance(msg, ProgressMessage):
@@ -1822,38 +1833,38 @@ class DownloaderGUI(object):
             self._show_progress(None)
             return  # halt progress.
         elif isinstance(msg, StartCollectionMessage):
-            show("Downloading collection %s" % msg.collection.id)
+            show('Downloading collection %s' % msg.collection.id)
             self._log_indent += 1
         elif isinstance(msg, StartPackageMessage):
-            show("Downloading package %s" % msg.package.id)
+            show('Downloading package %s' % msg.package.id)
         elif isinstance(msg, UpToDateMessage):
-            show("Package %s is up-to-date!" % msg.package.id)
+            show('Package %s is up-to-date!' % msg.package.id)
         # elif isinstance(msg, StaleMessage):
         #    show('Package %s is out-of-date or corrupt' % msg.package.id)
         elif isinstance(msg, FinishDownloadMessage):
-            show("Finished downloading %r." % msg.package.id)
+            show('Finished downloading %r.' % msg.package.id)
         elif isinstance(msg, StartUnzipMessage):
-            show("Unzipping %s" % msg.package.filename)
+            show('Unzipping %s' % msg.package.filename)
         elif isinstance(msg, FinishCollectionMessage):
             self._log_indent -= 1
-            show("Finished downloading collection %r." % msg.collection.id)
+            show('Finished downloading collection %r.' % msg.collection.id)
             self._clear_mark(msg.collection.id)
         elif isinstance(msg, FinishPackageMessage):
             self._clear_mark(msg.package.id)
         afterid = self.top.after(self._DL_DELAY, self._download_cb, download_iter, ids)
-        self._afterid["_download_cb"] = afterid
+        self._afterid['_download_cb'] = afterid
 
     def _select(self, id):
         for row in range(len(self._table)):
-            if self._table[row, "Identifier"] == id:
+            if self._table[row, 'Identifier'] == id:
                 self._table.select(row)
                 return
 
     def _color_table(self):
         # Color rows according to status.
         for row in range(len(self._table)):
-            bg, sbg = self._ROW_COLOR[self._table[row, "Status"]]
-            fg, sfg = ("black", "white")
+            bg, sbg = self._ROW_COLOR[self._table[row, 'Status']]
+            fg, sfg = ('black', 'white')
             self._table.rowconfig(
                 row,
                 foreground=fg,
@@ -1868,25 +1879,25 @@ class DownloaderGUI(object):
 
     def _clear_mark(self, id):
         for row in range(len(self._table)):
-            if self._table[row, "Identifier"] == id:
-                self._table[row, 0] = ""
+            if self._table[row, 'Identifier'] == id:
+                self._table[row, 0] = ''
 
     def _mark_all(self, *e):
         for row in range(len(self._table)):
-            self._table[row, 0] = "X"
+            self._table[row, 0] = 'X'
 
     def _table_mark(self, *e):
         selection = self._table.selected_row()
         if selection >= 0:
-            if self._table[selection][0] != "":
-                self._table[selection, 0] = ""
+            if self._table[selection][0] != '':
+                self._table[selection, 0] = ''
             else:
-                self._table[selection, 0] = "X"
+                self._table[selection, 0] = 'X'
         self._table.select(delta=1)
 
     def _show_log(self):
-        text = "\n".join(self._log_messages)
-        ShowText(self.top, "NLTK Downloader Log", text)
+        text = '\n'.join(self._log_messages)
+        ShowText(self.top, 'NLTK Downloader Log', text)
 
     def _package_to_columns(self, pkg):
         """
@@ -1896,14 +1907,14 @@ class DownloaderGUI(object):
         row = []
         for column_index, column_name in enumerate(self.COLUMNS):
             if column_index == 0:  # Mark:
-                row.append("")
-            elif column_name == "Identifier":
+                row.append('')
+            elif column_name == 'Identifier':
                 row.append(pkg.id)
-            elif column_name == "Status":
+            elif column_name == 'Status':
                 row.append(self._ds.status(pkg))
             else:
-                attr = column_name.lower().replace(" ", "_")
-                row.append(getattr(pkg, attr, "n/a"))
+                attr = column_name.lower().replace(' ', '_')
+                row.append(getattr(pkg, attr, 'n/a'))
         return row
 
     # /////////////////////////////////////////////////////////////////
@@ -1971,19 +1982,19 @@ class DownloaderGUI(object):
         try:
             ShowText(
                 self.top,
-                "Help: NLTK Dowloader",
+                'Help: NLTK Dowloader',
                 self.HELP.strip(),
                 width=75,
-                font="fixed",
+                font='fixed',
             )
         except:
-            ShowText(self.top, "Help: NLTK Downloader", self.HELP.strip(), width=75)
+            ShowText(self.top, 'Help: NLTK Downloader', self.HELP.strip(), width=75)
 
     def about(self, *e):
         ABOUT = "NLTK Downloader\n" + "Written by Edward Loper"
-        TITLE = "About: NLTK Downloader"
+        TITLE = 'About: NLTK Downloader'
         try:
-            from tkinter.messagebox import Message
+            from six.moves.tkinter_messagebox import Message
 
             Message(message=ABOUT, title=TITLE).show()
         except ImportError:
@@ -1997,47 +2008,47 @@ class DownloaderGUI(object):
 
     def _init_progressbar(self):
         c = self._progressbar
-        width, height = int(c["width"]), int(c["height"])
-        for i in range(0, (int(c["width"]) * 2) // self._gradient_width):
+        width, height = int(c['width']), int(c['height'])
+        for i in range(0, (int(c['width']) * 2) // self._gradient_width):
             c.create_line(
                 i * self._gradient_width + 20,
                 -20,
                 i * self._gradient_width - height - 20,
                 height + 20,
                 width=self._gradient_width,
-                fill="#%02x0000" % (80 + abs(i % 6 - 3) * 12),
+                fill='#%02x0000' % (80 + abs(i % 6 - 3) * 12),
             )
-        c.addtag_all("gradient")
-        c.itemconfig("gradient", state="hidden")
+        c.addtag_all('gradient')
+        c.itemconfig('gradient', state='hidden')
 
         # This is used to display progress
         c.addtag_withtag(
-            "redbox", c.create_rectangle(0, 0, 0, 0, fill=self._PROGRESS_COLOR[0])
+            'redbox', c.create_rectangle(0, 0, 0, 0, fill=self._PROGRESS_COLOR[0])
         )
 
     def _show_progress(self, percent):
         c = self._progressbar
         if percent is None:
-            c.coords("redbox", 0, 0, 0, 0)
-            c.itemconfig("gradient", state="hidden")
+            c.coords('redbox', 0, 0, 0, 0)
+            c.itemconfig('gradient', state='hidden')
         else:
-            width, height = int(c["width"]), int(c["height"])
+            width, height = int(c['width']), int(c['height'])
             x = percent * int(width) // 100 + 1
-            c.coords("redbox", 0, 0, x, height + 1)
+            c.coords('redbox', 0, 0, x, height + 1)
 
     def _progress_alive(self):
         c = self._progressbar
         if not self._downloading:
-            c.itemconfig("gradient", state="hidden")
+            c.itemconfig('gradient', state='hidden')
         else:
-            c.itemconfig("gradient", state="normal")
-            x1, y1, x2, y2 = c.bbox("gradient")
+            c.itemconfig('gradient', state='normal')
+            x1, y1, x2, y2 = c.bbox('gradient')
             if x1 <= -100:
-                c.move("gradient", (self._gradient_width * 6) - 4, 0)
+                c.move('gradient', (self._gradient_width * 6) - 4, 0)
             else:
-                c.move("gradient", -4, 0)
+                c.move('gradient', -4, 0)
             afterid = self.top.after(200, self._progress_alive)
-            self._afterid["_progress_alive"] = afterid
+            self._afterid['_progress_alive'] = afterid
 
     # /////////////////////////////////////////////////////////////////
     # Threaded downloader
@@ -2051,16 +2062,16 @@ class DownloaderGUI(object):
             return
 
         # Change the 'download' button to an 'abort' button.
-        self._download_button["text"] = "Cancel"
+        self._download_button['text'] = 'Cancel'
 
         marked = [
-            self._table[row, "Identifier"]
+            self._table[row, 'Identifier']
             for row in range(len(self._table))
-            if self._table[row, 0] != ""
+            if self._table[row, 0] != ''
         ]
         selection = self._table.selected_row()
         if not marked and selection is not None:
-            marked = [self._table[selection, "Identifier"]]
+            marked = [self._table[selection, 'Identifier']]
 
         # Create a new data server object for the download operation,
         # just in case the user modifies our data server during the
@@ -2090,7 +2101,7 @@ class DownloaderGUI(object):
     def _abort_download(self):
         if self._downloading:
             self._download_lock.acquire()
-            self._download_abort_queue.append("abort")
+            self._download_abort_queue.append('abort')
             self._download_lock.release()
 
     class _DownloadThread(threading.Thread):
@@ -2108,19 +2119,19 @@ class DownloaderGUI(object):
                 self.message_queue.append(msg)
                 # Check if we've been told to kill ourselves:
                 if self.abort:
-                    self.message_queue.append("aborted")
+                    self.message_queue.append('aborted')
                     self.lock.release()
                     return
                 self.lock.release()
             self.lock.acquire()
-            self.message_queue.append("finished")
+            self.message_queue.append('finished')
             self.lock.release()
 
     _MONITOR_QUEUE_DELAY = 100
 
     def _monitor_message_queue(self):
         def show(s):
-            self._progresslabel["text"] = s
+            self._progresslabel['text'] = s
             self._log(s)
 
         # Try to acquire the lock; if it's busy, then just try again later.
@@ -2129,20 +2140,20 @@ class DownloaderGUI(object):
         for msg in self._download_msg_queue:
 
             # Done downloading?
-            if msg == "finished" or msg == "aborted":
+            if msg == 'finished' or msg == 'aborted':
                 # self._fill_table(sort=False)
                 self._update_table_status()
                 self._downloading = False
-                self._download_button["text"] = "Download"
+                self._download_button['text'] = 'Download'
                 del self._download_msg_queue[:]
                 del self._download_abort_queue[:]
                 self._download_lock.release()
-                if msg == "aborted":
-                    show("Download aborted!")
+                if msg == 'aborted':
+                    show('Download aborted!')
                     self._show_progress(None)
                 else:
                     afterid = self.top.after(100, self._show_progress, None)
-                    self._afterid["_monitor_message_queue"] = afterid
+                    self._afterid['_monitor_message_queue'] = afterid
                 return
 
             # All other messages
@@ -2156,25 +2167,25 @@ class DownloaderGUI(object):
                 self._downloading = False
                 return  # halt progress.
             elif isinstance(msg, StartCollectionMessage):
-                show("Downloading collection %r" % msg.collection.id)
+                show('Downloading collection %r' % msg.collection.id)
                 self._log_indent += 1
             elif isinstance(msg, StartPackageMessage):
                 self._ds.clear_status_cache(msg.package.id)
-                show("Downloading package %r" % msg.package.id)
+                show('Downloading package %r' % msg.package.id)
             elif isinstance(msg, UpToDateMessage):
-                show("Package %s is up-to-date!" % msg.package.id)
+                show('Package %s is up-to-date!' % msg.package.id)
             # elif isinstance(msg, StaleMessage):
             #    show('Package %s is out-of-date or corrupt; updating it' %
             #         msg.package.id)
             elif isinstance(msg, FinishDownloadMessage):
-                show("Finished downloading %r." % msg.package.id)
+                show('Finished downloading %r.' % msg.package.id)
             elif isinstance(msg, StartUnzipMessage):
-                show("Unzipping %s" % msg.package.filename)
+                show('Unzipping %s' % msg.package.filename)
             elif isinstance(msg, FinishUnzipMessage):
-                show("Finished installing %s" % msg.package.id)
+                show('Finished installing %s' % msg.package.id)
             elif isinstance(msg, FinishCollectionMessage):
                 self._log_indent -= 1
-                show("Finished downloading collection %r." % msg.collection.id)
+                show('Finished downloading collection %r.' % msg.collection.id)
                 self._clear_mark(msg.collection.id)
             elif isinstance(msg, FinishPackageMessage):
                 self._update_table_status()
@@ -2184,7 +2195,7 @@ class DownloaderGUI(object):
         # waiting for a good point to abort it, so we don't end up
         # with a partially unzipped package or anything like that).
         if self._download_abort_queue:
-            self._progresslabel["text"] = "Aborting download..."
+            self._progresslabel['text'] = 'Aborting download...'
 
         # Clear the message queue and then release the lock
         del self._download_msg_queue[:]
@@ -2192,7 +2203,7 @@ class DownloaderGUI(object):
 
         # Check the queue again after MONITOR_QUEUE_DELAY msec.
         afterid = self.top.after(self._MONITOR_QUEUE_DELAY, self._monitor_message_queue)
-        self._afterid["_monitor_message_queue"] = afterid
+        self._afterid['_monitor_message_queue'] = afterid
 
 
 ######################################################################
@@ -2206,8 +2217,8 @@ def md5_hexdigest(file):
     Calculate and return the MD5 checksum for a given file.
     ``file`` may either be a filename or an open stream.
     """
-    if isinstance(file, str):
-        with open(file, "rb") as infile:
+    if isinstance(file, string_types):
+        with open(file, 'rb') as infile:
             return _md5_hexdigest(infile)
     return _md5_hexdigest(file)
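Reviewer note: _md5_hexdigest itself is outside this hunk; a minimal sketch of the block-wise hashing it presumably performs (the path below is purely illustrative):

    import hashlib

    def md5_of_stream(infile, blocksize=64 * 1024):
        md5 = hashlib.md5()
        for block in iter(lambda: infile.read(blocksize), b''):
            md5.update(block)
        return md5.hexdigest()

    with open('nltk_data/packages/corpora/treebank.zip', 'rb') as f:
        print(md5_of_stream(f))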
 
@@ -2237,20 +2248,54 @@ def unzip(filename, root, verbose=True):
 
 def _unzip_iter(filename, root, verbose=True):
     if verbose:
-        sys.stdout.write("Unzipping %s" % os.path.split(filename)[1])
+        sys.stdout.write('Unzipping %s' % os.path.split(filename)[1])
         sys.stdout.flush()
 
     try:
         zf = zipfile.ZipFile(filename)
     except zipfile.error as e:
-        yield ErrorMessage(filename, "Error with downloaded zip file")
+        yield ErrorMessage(filename, 'Error with downloaded zip file')
         return
     except Exception as e:
         yield ErrorMessage(filename, e)
         return
 
-    zf.extractall(root)
+    # Get lists of directories & files
+    namelist = zf.namelist()
+    dirlist = set()
+    for x in namelist:
+        if x.endswith('/'):
+            dirlist.add(x)
+        else:
+            dirlist.add(x.rsplit('/', 1)[0] + '/')
+    filelist = [x for x in namelist if not x.endswith('/')]
+
+    # Create the target directory if it doesn't exist
+    if not os.path.exists(root):
+        os.mkdir(root)
+
+    # Create the directory structure
+    for dirname in sorted(dirlist):
+        pieces = dirname[:-1].split('/')
+        for i in range(len(pieces)):
+            dirpath = os.path.join(root, *pieces[: i + 1])
+            if not os.path.exists(dirpath):
+                os.mkdir(dirpath)
+
+    # Extract files.
+    for i, filename in enumerate(filelist):
+        filepath = os.path.join(root, *filename.split('/'))
+
+        try:
+            with open(filepath, 'wb') as dstfile, zf.open(filename) as srcfile:
+                shutil.copyfileobj(srcfile, dstfile)
+        except Exception as e:
+            yield ErrorMessage(filename, e)
+            return
 
+        if verbose and (i * 10 / len(filelist) > (i - 1) * 10 / len(filelist)):
+            sys.stdout.write('.')
+            sys.stdout.flush()
     if verbose:
         print()
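Reviewer note: for comparison, the one-liner this loop replaces; it is shorter but gives no hook for the per-file progress dots or the ErrorMessage reporting above (archive name illustrative):

    import zipfile

    with zipfile.ZipFile('treebank.zip') as zf:
        zf.extractall('nltk_data/corpora')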
 
@@ -2293,39 +2338,39 @@ def build_index(root, base_url):
     """
     # Find all packages.
     packages = []
-    for pkg_xml, zf, subdir in _find_packages(os.path.join(root, "packages")):
+    for pkg_xml, zf, subdir in _find_packages(os.path.join(root, 'packages')):
         zipstat = os.stat(zf.filename)
-        url = "%s/%s/%s" % (base_url, subdir, os.path.split(zf.filename)[1])
+        url = '%s/%s/%s' % (base_url, subdir, os.path.split(zf.filename)[1])
         unzipped_size = sum(zf_info.file_size for zf_info in zf.infolist())
 
         # Fill in several fields of the package xml with calculated values.
-        pkg_xml.set("unzipped_size", "%s" % unzipped_size)
-        pkg_xml.set("size", "%s" % zipstat.st_size)
-        pkg_xml.set("checksum", "%s" % md5_hexdigest(zf.filename))
-        pkg_xml.set("subdir", subdir)
+        pkg_xml.set('unzipped_size', '%s' % unzipped_size)
+        pkg_xml.set('size', '%s' % zipstat.st_size)
+        pkg_xml.set('checksum', '%s' % md5_hexdigest(zf.filename))
+        pkg_xml.set('subdir', subdir)
         # pkg_xml.set('svn_revision', _svn_revision(zf.filename))
-        if not pkg_xml.get("url"):
-            pkg_xml.set("url", url)
+        if not pkg_xml.get('url'):
+            pkg_xml.set('url', url)
 
         # Record the package.
         packages.append(pkg_xml)
 
     # Find all collections
-    collections = list(_find_collections(os.path.join(root, "collections")))
+    collections = list(_find_collections(os.path.join(root, 'collections')))
 
     # Check that all UIDs are unique
     uids = set()
     for item in packages + collections:
-        if item.get("id") in uids:
-            raise ValueError("Duplicate UID: %s" % item.get("id"))
-        uids.add(item.get("id"))
+        if item.get('id') in uids:
+            raise ValueError('Duplicate UID: %s' % item.get('id'))
+        uids.add(item.get('id'))
 
     # Put it all together
-    top_elt = ElementTree.Element("nltk_data")
-    top_elt.append(ElementTree.Element("packages"))
+    top_elt = ElementTree.Element('nltk_data')
+    top_elt.append(ElementTree.Element('packages'))
     for package in packages:
         top_elt[0].append(package)
-    top_elt.append(ElementTree.Element("collections"))
+    top_elt.append(ElementTree.Element('collections'))
     for collection in collections:
         top_elt[1].append(collection)
 
@@ -2333,7 +2378,7 @@ def build_index(root, base_url):
     return top_elt
 
 
-def _indent_xml(xml, prefix=""):
+def _indent_xml(xml, prefix=''):
     """
     Helper for ``build_index()``: Given an XML ``ElementTree``, modify it
     (and its descendants) ``text`` and ``tail`` attributes to generate
@@ -2341,12 +2386,12 @@ def _indent_xml(xml, prefix=""):
     spaces with respect to its parent.
     """
     if len(xml) > 0:
-        xml.text = (xml.text or "").strip() + "\n" + prefix + "  "
+        xml.text = (xml.text or '').strip() + '\n' + prefix + '  '
         for child in xml:
-            _indent_xml(child, prefix + "  ")
+            _indent_xml(child, prefix + '  ')
         for child in xml[:-1]:
-            child.tail = (child.tail or "").strip() + "\n" + prefix + "  "
-        xml[-1].tail = (xml[-1].tail or "").strip() + "\n" + prefix
+            child.tail = (child.tail or '').strip() + '\n' + prefix + '  '
+        xml[-1].tail = (xml[-1].tail or '').strip() + '\n' + prefix
 
 
 def _check_package(pkg_xml, zipfilename, zf):
@@ -2356,16 +2401,16 @@ def _check_package(pkg_xml, zipfilename, zf):
     """
     # The filename must match the id given in the XML file.
     uid = os.path.splitext(os.path.split(zipfilename)[1])[0]
-    if pkg_xml.get("id") != uid:
+    if pkg_xml.get('id') != uid:
         raise ValueError(
-            "package identifier mismatch (%s vs %s)" % (pkg_xml.get("id"), uid)
+            'package identifier mismatch (%s vs %s)' % (pkg_xml.get('id'), uid)
         )
 
     # Zip file must expand to a subdir whose name matches uid.
-    if sum((name != uid and not name.startswith(uid + "/")) for name in zf.namelist()):
+    if sum((name != uid and not name.startswith(uid + '/')) for name in zf.namelist()):
         raise ValueError(
-            "Zipfile %s.zip does not expand to a single "
-            "subdirectory %s/" % (uid, uid)
+            'Zipfile %s.zip does not expand to a single '
+            'subdirectory %s/' % (uid, uid)
         )
 
 
@@ -2376,14 +2421,14 @@ def _svn_revision(filename):
     number for a given file (by using ``subprocess`` to run ``svn``).
     """
     p = subprocess.Popen(
-        ["svn", "status", "-v", filename],
+        ['svn', 'status', '-v', filename],
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
     )
     (stdout, stderr) = p.communicate()
     if p.returncode != 0 or stderr or not stdout:
         raise ValueError(
-            "Error determining svn_revision for %s: %s"
+            'Error determining svn_revision for %s: %s'
             % (os.path.split(filename)[1], textwrap.fill(stderr))
         )
     return stdout.split()[2]
@@ -2397,7 +2442,7 @@ def _find_collections(root):
     packages = []
     for dirname, subdirs, files in os.walk(root):
         for filename in files:
-            if filename.endswith(".xml"):
+            if filename.endswith('.xml'):
                 xmlfile = os.path.join(dirname, filename)
                 yield ElementTree.parse(xmlfile).getroot()
 
@@ -2417,43 +2462,43 @@ def _find_packages(root):
     # Find all packages.
     packages = []
     for dirname, subdirs, files in os.walk(root):
-        relpath = "/".join(_path_from(root, dirname))
+        relpath = '/'.join(_path_from(root, dirname))
         for filename in files:
-            if filename.endswith(".xml"):
+            if filename.endswith('.xml'):
                 xmlfilename = os.path.join(dirname, filename)
-                zipfilename = xmlfilename[:-4] + ".zip"
+                zipfilename = xmlfilename[:-4] + '.zip'
                 try:
                     zf = zipfile.ZipFile(zipfilename)
                 except Exception as e:
-                    raise ValueError("Error reading file %r!\n%s" % (zipfilename, e))
+                    raise ValueError('Error reading file %r!\n%s' % (zipfilename, e))
                 try:
                     pkg_xml = ElementTree.parse(xmlfilename).getroot()
                 except Exception as e:
-                    raise ValueError("Error reading file %r!\n%s" % (xmlfilename, e))
+                    raise ValueError('Error reading file %r!\n%s' % (xmlfilename, e))
 
                 # Check that the UID matches the filename
                 uid = os.path.split(xmlfilename[:-4])[1]
-                if pkg_xml.get("id") != uid:
+                if pkg_xml.get('id') != uid:
                     raise ValueError(
-                        "package identifier mismatch (%s "
-                        "vs %s)" % (pkg_xml.get("id"), uid)
+                        'package identifier mismatch (%s '
+                        'vs %s)' % (pkg_xml.get('id'), uid)
                     )
 
                 # Check that the zipfile expands to a subdir whose
                 # name matches the uid.
                 if sum(
-                    (name != uid and not name.startswith(uid + "/"))
+                    (name != uid and not name.startswith(uid + '/'))
                     for name in zf.namelist()
                 ):
                     raise ValueError(
-                        "Zipfile %s.zip does not expand to a "
-                        "single subdirectory %s/" % (uid, uid)
+                        'Zipfile %s.zip does not expand to a '
+                        'single subdirectory %s/' % (uid, uid)
                     )
 
                 yield pkg_xml, zf, relpath
         # Don't recurse into svn subdirectories:
         try:
-            subdirs.remove(".svn")
+            subdirs.remove('.svn')
         except ValueError:
             pass
 
@@ -2481,7 +2526,7 @@ def update():
     _downloader.update()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     from optparse import OptionParser
 
     parser = OptionParser()
@@ -2520,7 +2565,7 @@ if __name__ == "__main__":
         "-u",
         "--url",
         dest="server_index_url",
-        default=os.environ.get("NLTK_DOWNLOAD_URL"),
+        default=os.environ.get('NLTK_DOWNLOAD_URL'),
         help="download server index url",
     )
 
index 8e90fd1..f5c6a6e 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: graphical representations package
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
@@ -8,7 +8,7 @@
 
 # Import Tkinter-based modules if Tkinter is installed
 try:
-    import tkinter
+    from six.moves import tkinter
 except ImportError:
     import warnings
 
index f679312..2beb7b0 100644 (file)
Binary files a/nlp_resource_data/nltk/draw/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/draw/__pycache__/__init__.cpython-37.pyc differ
index 9a77630..376fd86 100644 (file)
Binary files a/nlp_resource_data/nltk/draw/__pycache__/cfg.cpython-37.pyc and b/nlp_resource_data/nltk/draw/__pycache__/cfg.cpython-37.pyc differ
index ee174f3..58927ad 100644 (file)
Binary files a/nlp_resource_data/nltk/draw/__pycache__/dispersion.cpython-37.pyc and b/nlp_resource_data/nltk/draw/__pycache__/dispersion.cpython-37.pyc differ
index 57ffb51..8961ced 100644 (file)
Binary files a/nlp_resource_data/nltk/draw/__pycache__/table.cpython-37.pyc and b/nlp_resource_data/nltk/draw/__pycache__/table.cpython-37.pyc differ
index d62d51d..1236022 100644 (file)
Binary files a/nlp_resource_data/nltk/draw/__pycache__/tree.cpython-37.pyc and b/nlp_resource_data/nltk/draw/__pycache__/tree.cpython-37.pyc differ
index 9aee591..e6cdb22 100644 (file)
Binary files a/nlp_resource_data/nltk/draw/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/draw/__pycache__/util.cpython-37.pyc differ
index 9cab511..3afb3e4 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: CFG visualization
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -48,7 +48,8 @@ Visualization tools for CFGs.
 
 import re
 
-from tkinter import (
+from six import string_types
+from six.moves.tkinter import (
     Button,
     Canvas,
     Entry,
@@ -78,24 +79,24 @@ from nltk.draw.util import (
 
 
 class ProductionList(ColorizedList):
-    ARROW = SymbolWidget.SYMBOLS["rightarrow"]
+    ARROW = SymbolWidget.SYMBOLS['rightarrow']
 
     def _init_colortags(self, textwidget, options):
-        textwidget.tag_config("terminal", foreground="#006000")
-        textwidget.tag_config("arrow", font="symbol", underline="0")
+        textwidget.tag_config('terminal', foreground='#006000')
+        textwidget.tag_config('arrow', font='symbol', underline='0')
         textwidget.tag_config(
-            "nonterminal", foreground="blue", font=("helvetica", -12, "bold")
+            'nonterminal', foreground='blue', font=('helvetica', -12, 'bold')
         )
 
     def _item_repr(self, item):
         contents = []
-        contents.append(("%s\t" % item.lhs(), "nonterminal"))
-        contents.append((self.ARROW, "arrow"))
+        contents.append(('%s\t' % item.lhs(), 'nonterminal'))
+        contents.append((self.ARROW, 'arrow'))
         for elt in item.rhs():
             if isinstance(elt, Nonterminal):
-                contents.append((" %s" % elt.symbol(), "nonterminal"))
+                contents.append((' %s' % elt.symbol(), 'nonterminal'))
             else:
-                contents.append((" %r" % elt, "terminal"))
+                contents.append((' %r' % elt, 'terminal'))
         return contents
 
 
@@ -160,7 +161,7 @@ class CFGEditor(object):
 
     # Regular expressions used by _analyze_line.  Precompile them, so
     # we can process the text faster.
-    ARROW = SymbolWidget.SYMBOLS["rightarrow"]
+    ARROW = SymbolWidget.SYMBOLS['rightarrow']
     _LHS_RE = re.compile(r"(^\s*\w+\s*)(->|(" + ARROW + "))")
     _ARROW_RE = re.compile("\s*(->|(" + ARROW + "))\s*")
     _PRODUCTION_RE = re.compile(
@@ -171,14 +172,14 @@ class CFGEditor(object):
         + r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$"  # arrow
     )  # RHS
     _TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|(" + ARROW + ")")
-    _BOLD = ("helvetica", -12, "bold")
+    _BOLD = ('helvetica', -12, 'bold')
 
     def __init__(self, parent, cfg=None, set_cfg_callback=None):
         self._parent = parent
         if cfg is not None:
             self._cfg = cfg
         else:
-            self._cfg = CFG(Nonterminal("S"), [])
+            self._cfg = CFG(Nonterminal('S'), [])
         self._set_cfg_callback = set_cfg_callback
 
         self._highlight_matching_nonterminals = 1
@@ -188,97 +189,97 @@ class CFGEditor(object):
         self._init_bindings()
 
         self._init_startframe()
-        self._startframe.pack(side="top", fill="x", expand=0)
+        self._startframe.pack(side='top', fill='x', expand=0)
         self._init_prodframe()
-        self._prodframe.pack(side="top", fill="both", expand=1)
+        self._prodframe.pack(side='top', fill='both', expand=1)
         self._init_buttons()
-        self._buttonframe.pack(side="bottom", fill="x", expand=0)
+        self._buttonframe.pack(side='bottom', fill='x', expand=0)
 
         self._textwidget.focus()
 
     def _init_startframe(self):
         frame = self._startframe = Frame(self._top)
         self._start = Entry(frame)
-        self._start.pack(side="right")
-        Label(frame, text="Start Symbol:").pack(side="right")
-        Label(frame, text="Productions:").pack(side="left")
+        self._start.pack(side='right')
+        Label(frame, text='Start Symbol:').pack(side='right')
+        Label(frame, text='Productions:').pack(side='left')
         self._start.insert(0, self._cfg.start().symbol())
 
     def _init_buttons(self):
         frame = self._buttonframe = Frame(self._top)
-        Button(frame, text="Ok", command=self._ok, underline=0, takefocus=0).pack(
-            side="left"
+        Button(frame, text='Ok', command=self._ok, underline=0, takefocus=0).pack(
+            side='left'
         )
-        Button(frame, text="Apply", command=self._apply, underline=0, takefocus=0).pack(
-            side="left"
+        Button(frame, text='Apply', command=self._apply, underline=0, takefocus=0).pack(
+            side='left'
         )
-        Button(frame, text="Reset", command=self._reset, underline=0, takefocus=0).pack(
-            side="left"
+        Button(frame, text='Reset', command=self._reset, underline=0, takefocus=0).pack(
+            side='left'
         )
         Button(
-            frame, text="Cancel", command=self._cancel, underline=0, takefocus=0
-        ).pack(side="left")
-        Button(frame, text="Help", command=self._help, underline=0, takefocus=0).pack(
-            side="right"
+            frame, text='Cancel', command=self._cancel, underline=0, takefocus=0
+        ).pack(side='left')
+        Button(frame, text='Help', command=self._help, underline=0, takefocus=0).pack(
+            side='right'
         )
 
     def _init_bindings(self):
-        self._top.title("CFG Editor")
-        self._top.bind("<Control-q>", self._cancel)
-        self._top.bind("<Alt-q>", self._cancel)
-        self._top.bind("<Control-d>", self._cancel)
+        self._top.title('CFG Editor')
+        self._top.bind('<Control-q>', self._cancel)
+        self._top.bind('<Alt-q>', self._cancel)
+        self._top.bind('<Control-d>', self._cancel)
         # self._top.bind('<Control-x>', self._cancel)
-        self._top.bind("<Alt-x>", self._cancel)
-        self._top.bind("<Escape>", self._cancel)
+        self._top.bind('<Alt-x>', self._cancel)
+        self._top.bind('<Escape>', self._cancel)
         # self._top.bind('<Control-c>', self._cancel)
-        self._top.bind("<Alt-c>", self._cancel)
-
-        self._top.bind("<Control-o>", self._ok)
-        self._top.bind("<Alt-o>", self._ok)
-        self._top.bind("<Control-a>", self._apply)
-        self._top.bind("<Alt-a>", self._apply)
-        self._top.bind("<Control-r>", self._reset)
-        self._top.bind("<Alt-r>", self._reset)
-        self._top.bind("<Control-h>", self._help)
-        self._top.bind("<Alt-h>", self._help)
-        self._top.bind("<F1>", self._help)
+        self._top.bind('<Alt-c>', self._cancel)
+
+        self._top.bind('<Control-o>', self._ok)
+        self._top.bind('<Alt-o>', self._ok)
+        self._top.bind('<Control-a>', self._apply)
+        self._top.bind('<Alt-a>', self._apply)
+        self._top.bind('<Control-r>', self._reset)
+        self._top.bind('<Alt-r>', self._reset)
+        self._top.bind('<Control-h>', self._help)
+        self._top.bind('<Alt-h>', self._help)
+        self._top.bind('<F1>', self._help)
 
     def _init_prodframe(self):
         self._prodframe = Frame(self._top)
 
         # Create the basic Text widget & scrollbar.
         self._textwidget = Text(
-            self._prodframe, background="#e0e0e0", exportselection=1
+            self._prodframe, background='#e0e0e0', exportselection=1
         )
-        self._textscroll = Scrollbar(self._prodframe, takefocus=0, orient="vertical")
+        self._textscroll = Scrollbar(self._prodframe, takefocus=0, orient='vertical')
         self._textwidget.config(yscrollcommand=self._textscroll.set)
         self._textscroll.config(command=self._textwidget.yview)
-        self._textscroll.pack(side="right", fill="y")
-        self._textwidget.pack(expand=1, fill="both", side="left")
+        self._textscroll.pack(side='right', fill='y')
+        self._textwidget.pack(expand=1, fill='both', side='left')
 
         # Initialize the colorization tags.  Each nonterminal gets its
         # own tag, so they aren't listed here.
-        self._textwidget.tag_config("terminal", foreground="#006000")
-        self._textwidget.tag_config("arrow", font="symbol")
-        self._textwidget.tag_config("error", background="red")
+        self._textwidget.tag_config('terminal', foreground='#006000')
+        self._textwidget.tag_config('arrow', font='symbol')
+        self._textwidget.tag_config('error', background='red')
 
         # Keep track of what line they're on.  We use that to remember
         # to re-analyze a line whenever they leave it.
         self._linenum = 0
 
         # Expand "->" to an arrow.
-        self._top.bind(">", self._replace_arrows)
+        self._top.bind('>', self._replace_arrows)
 
         # Re-colorize lines when appropriate.
-        self._top.bind("<<Paste>>", self._analyze)
-        self._top.bind("<KeyPress>", self._check_analyze)
-        self._top.bind("<ButtonPress>", self._check_analyze)
+        self._top.bind('<<Paste>>', self._analyze)
+        self._top.bind('<KeyPress>', self._check_analyze)
+        self._top.bind('<ButtonPress>', self._check_analyze)
 
         # Tab cycles focus. (why doesn't this work??)
         def cycle(e, textwidget=self._textwidget):
             textwidget.tk_focusNext().focus()
 
-        self._textwidget.bind("<Tab>", cycle)
+        self._textwidget.bind('<Tab>', cycle)
 
         prod_tuples = [(p.lhs(), [p.rhs()]) for p in self._cfg.productions()]
         for i in range(len(prod_tuples) - 1, 0, -1):
@@ -294,16 +295,16 @@ class CFGEditor(object):
 
         for lhs, rhss in prod_tuples:
             print(lhs, rhss)
-            s = "%s ->" % lhs
+            s = '%s ->' % lhs
             for rhs in rhss:
                 for elt in rhs:
                     if isinstance(elt, Nonterminal):
-                        s += " %s" % elt
+                        s += ' %s' % elt
                     else:
-                        s += " %r" % elt
-                s += " |"
-            s = s[:-2] + "\n"
-            self._textwidget.insert("end", s)
+                        s += ' %r' % elt
+                s += ' |'
+            s = s[:-2] + '\n'
+            self._textwidget.insert('end', s)
 
         self._analyze()
 
@@ -337,10 +338,10 @@ class CFGEditor(object):
         Remove all tags (except ``arrow`` and ``sel``) from the given
         line of the text widget used for editing the productions.
         """
-        start = "%d.0" % linenum
-        end = "%d.end" % linenum
+        start = '%d.0' % linenum
+        end = '%d.end' % linenum
         for tag in self._textwidget.tag_names():
-            if tag not in ("arrow", "sel"):
+            if tag not in ('arrow', 'sel'):
                 self._textwidget.tag_remove(tag, start, end)
 
     def _check_analyze(self, *e):
@@ -349,7 +350,7 @@ class CFGEditor(object):
         all colorization from the line we moved to, and re-colorize
         the line that we moved from.
         """
-        linenum = int(self._textwidget.index("insert").split(".")[0])
+        linenum = int(self._textwidget.index('insert').split('.')[0])
         if linenum != self._linenum:
             self._clear_tags(linenum)
             self._analyze_line(self._linenum)
@@ -361,21 +362,21 @@ class CFGEditor(object):
         symbol font).  This searches the whole buffer, but is fast
         enough to be done anytime they press '>'.
         """
-        arrow = "1.0"
+        arrow = '1.0'
         while True:
-            arrow = self._textwidget.search("->", arrow, "end+1char")
-            if arrow == "":
+            arrow = self._textwidget.search('->', arrow, 'end+1char')
+            if arrow == '':
                 break
-            self._textwidget.delete(arrow, arrow + "+2char")
-            self._textwidget.insert(arrow, self.ARROW, "arrow")
-            self._textwidget.insert(arrow, "\t")
+            self._textwidget.delete(arrow, arrow + '+2char')
+            self._textwidget.insert(arrow, self.ARROW, 'arrow')
+            self._textwidget.insert(arrow, '\t')
 
-        arrow = "1.0"
+        arrow = '1.0'
         while True:
-            arrow = self._textwidget.search(self.ARROW, arrow + "+1char", "end+1char")
-            if arrow == "":
+            arrow = self._textwidget.search(self.ARROW, arrow + '+1char', 'end+1char')
+            if arrow == '':
                 break
-            self._textwidget.tag_add("arrow", arrow, arrow + "+1char")
+            self._textwidget.tag_add('arrow', arrow, arrow + '+1char')
 
     def _analyze_token(self, match, linenum):
         """
@@ -386,34 +387,34 @@ class CFGEditor(object):
         """
         # What type of token is it?
         if match.group()[0] in "'\"":
-            tag = "terminal"
-        elif match.group() in ("->", self.ARROW):
-            tag = "arrow"
+            tag = 'terminal'
+        elif match.group() in ('->', self.ARROW):
+            tag = 'arrow'
         else:
             # If it's a nonterminal, then set up new bindings, so we
             # can highlight all instances of that nonterminal when we
             # put the mouse over it.
-            tag = "nonterminal_" + match.group()
+            tag = 'nonterminal_' + match.group()
             if tag not in self._textwidget.tag_names():
                 self._init_nonterminal_tag(tag)
 
-        start = "%d.%d" % (linenum, match.start())
-        end = "%d.%d" % (linenum, match.end())
+        start = '%d.%d' % (linenum, match.start())
+        end = '%d.%d' % (linenum, match.end())
         self._textwidget.tag_add(tag, start, end)
 
-    def _init_nonterminal_tag(self, tag, foreground="blue"):
+    def _init_nonterminal_tag(self, tag, foreground='blue'):
         self._textwidget.tag_config(tag, foreground=foreground, font=CFGEditor._BOLD)
         if not self._highlight_matching_nonterminals:
             return
 
         def enter(e, textwidget=self._textwidget, tag=tag):
-            textwidget.tag_config(tag, background="#80ff80")
+            textwidget.tag_config(tag, background='#80ff80')
 
         def leave(e, textwidget=self._textwidget, tag=tag):
-            textwidget.tag_config(tag, background="")
+            textwidget.tag_config(tag, background='')
 
-        self._textwidget.tag_bind(tag, "<Enter>", enter)
-        self._textwidget.tag_bind(tag, "<Leave>", leave)
+        self._textwidget.tag_bind(tag, '<Enter>', enter)
+        self._textwidget.tag_bind(tag, '<Leave>', leave)
 
     def _analyze_line(self, linenum):
         """
@@ -423,7 +424,7 @@ class CFGEditor(object):
         self._clear_tags(linenum)
 
         # Get the line's text string.
-        line = self._textwidget.get(repr(linenum) + ".0", repr(linenum) + ".end")
+        line = self._textwidget.get(repr(linenum) + '.0', repr(linenum) + '.end')
 
         # If it's a valid production, then colorize each token.
         if CFGEditor._PRODUCTION_RE.match(line):
@@ -431,10 +432,10 @@ class CFGEditor(object):
             # and call analyze_token on each token.
             def analyze_token(match, self=self, linenum=linenum):
                 self._analyze_token(match, linenum)
-                return ""
+                return ''
 
             CFGEditor._TOKEN_RE.sub(analyze_token, line)
-        elif line.strip() != "":
+        elif line.strip() != '':
             # It's invalid; show the user where the error is.
             self._mark_error(linenum, line)
 
@@ -445,29 +446,29 @@ class CFGEditor(object):
         arrowmatch = CFGEditor._ARROW_RE.search(line)
         if not arrowmatch:
             # If there's no arrow at all, highlight the whole line.
-            start = "%d.0" % linenum
-            end = "%d.end" % linenum
+            start = '%d.0' % linenum
+            end = '%d.end' % linenum
         elif not CFGEditor._LHS_RE.match(line):
             # Otherwise, if the LHS is bad, highlight it.
-            start = "%d.0" % linenum
-            end = "%d.%d" % (linenum, arrowmatch.start())
+            start = '%d.0' % linenum
+            end = '%d.%d' % (linenum, arrowmatch.start())
         else:
             # Otherwise, highlight the RHS.
-            start = "%d.%d" % (linenum, arrowmatch.end())
-            end = "%d.end" % linenum
+            start = '%d.%d' % (linenum, arrowmatch.end())
+            end = '%d.end' % linenum
 
         # If we're highlighting 0 chars, highlight the whole line.
-        if self._textwidget.compare(start, "==", end):
-            start = "%d.0" % linenum
-            end = "%d.end" % linenum
-        self._textwidget.tag_add("error", start, end)
+        if self._textwidget.compare(start, '==', end):
+            start = '%d.0' % linenum
+            end = '%d.end' % linenum
+        self._textwidget.tag_add('error', start, end)
 
     def _analyze(self, *e):
         """
         Replace ``->`` with arrows, and colorize the entire buffer.
         """
         self._replace_arrows()
-        numlines = int(self._textwidget.index("end").split(".")[0])
+        numlines = int(self._textwidget.index('end').split('.')[0])
         for linenum in range(1, numlines + 1):  # line numbers start at 1.
             self._analyze_line(linenum)
 
@@ -479,15 +480,15 @@ class CFGEditor(object):
         productions = []
 
         # Get the text, normalize it, and split it into lines.
-        text = self._textwidget.get("1.0", "end")
-        text = re.sub(self.ARROW, "->", text)
-        text = re.sub("\t", " ", text)
-        lines = text.split("\n")
+        text = self._textwidget.get('1.0', 'end')
+        text = re.sub(self.ARROW, '->', text)
+        text = re.sub('\t', ' ', text)
+        lines = text.split('\n')
 
         # Convert each line to a CFG production
         for line in lines:
             line = line.strip()
-            if line == "":
+            if line == '':
                 continue
             productions += _read_cfg_production(line)
             # if line.strip() == '': continue
@@ -526,9 +527,9 @@ class CFGEditor(object):
             self._set_cfg_callback(cfg)
 
     def _reset(self, *e):
-        self._textwidget.delete("1.0", "end")
+        self._textwidget.delete('1.0', 'end')
         for production in self._cfg.productions():
-            self._textwidget.insert("end", "%s\n" % production)
+            self._textwidget.insert('end', '%s\n' % production)
         self._analyze()
         if self._set_cfg_callback is not None:
             self._set_cfg_callback(self._cfg)
@@ -545,15 +546,15 @@ class CFGEditor(object):
         try:
             ShowText(
                 self._parent,
-                "Help: Chart Parser Demo",
+                'Help: Chart Parser Demo',
                 (_CFGEditor_HELP).strip(),
                 width=75,
-                font="fixed",
+                font='fixed',
             )
         except:
             ShowText(
                 self._parent,
-                "Help: Chart Parser Demo",
+                'Help: Chart Parser Demo',
                 (_CFGEditor_HELP).strip(),
                 width=75,
             )
@@ -571,7 +572,7 @@ class CFGDemo(object):
 
         # Set up the main window.
         self._top = Tk()
-        self._top.title("Context Free Grammar Demo")
+        self._top.title('Context Free Grammar Demo')
 
         # Base font size
         self._size = IntVar(self._top)
@@ -582,7 +583,7 @@ class CFGDemo(object):
 
         # Create the basic frames
         frame1 = Frame(self._top)
-        frame1.pack(side="left", fill="y", expand=0)
+        frame1.pack(side='left', fill='y', expand=0)
         self._init_menubar(self._top)
         self._init_buttons(self._top)
         self._init_grammar(frame1)
@@ -594,7 +595,7 @@ class CFGDemo(object):
     # //////////////////////////////////////////////////
 
     def _init_bindings(self, top):
-        top.bind("<Control-q>", self.destroy)
+        top.bind('<Control-q>', self.destroy)
 
     def _init_menubar(self, parent):
         pass
@@ -604,19 +605,19 @@ class CFGDemo(object):
 
     def _init_grammar(self, parent):
         self._prodlist = ProductionList(parent, self._grammar, width=20)
-        self._prodlist.pack(side="top", fill="both", expand=1)
+        self._prodlist.pack(side='top', fill='both', expand=1)
         self._prodlist.focus()
-        self._prodlist.add_callback("select", self._selectprod_cb)
-        self._prodlist.add_callback("move", self._selectprod_cb)
+        self._prodlist.add_callback('select', self._selectprod_cb)
+        self._prodlist.add_callback('move', self._selectprod_cb)
 
     def _init_treelet(self, parent):
-        self._treelet_canvas = Canvas(parent, background="white")
-        self._treelet_canvas.pack(side="bottom", fill="x")
+        self._treelet_canvas = Canvas(parent, background='white')
+        self._treelet_canvas.pack(side='bottom', fill='x')
         self._treelet = None
 
     def _init_workspace(self, parent):
-        self._workspace = CanvasFrame(parent, background="white")
-        self._workspace.pack(side="right", fill="both", expand=1)
+        self._workspace = CanvasFrame(parent, background='white')
+        self._workspace.pack(side='right', fill='both', expand=1)
         self._tree = None
         self.reset_workspace()
 
@@ -627,8 +628,8 @@ class CFGDemo(object):
     def reset_workspace(self):
         c = self._workspace.canvas()
         fontsize = int(self._size.get())
-        node_font = ("helvetica", -(fontsize + 4), "bold")
-        leaf_font = ("helvetica", -(fontsize + 2))
+        node_font = ('helvetica', -(fontsize + 4), 'bold')
+        leaf_font = ('helvetica', -(fontsize + 2))
 
         # Remove the old tree
         if self._tree is not None:
@@ -644,7 +645,7 @@ class CFGDemo(object):
             leaves.append(TextWidget(c, word, font=leaf_font, draggable=1))
 
         # Put it all together into one tree
-        self._tree = TreeSegmentWidget(c, rootnode, leaves, color="white")
+        self._tree = TreeSegmentWidget(c, rootnode, leaves, color='white')
 
         # Add it to the workspace.
         self._workspace.add_widget(self._tree)
@@ -663,7 +664,7 @@ class CFGDemo(object):
         if tree is None:
             tree = self._tree
         for i in range(len(tree.subtrees()) - len(prod.rhs())):
-            if tree["color", i] == "white":
+            if tree['color', i] == 'white':
                 self._markproduction  # FIXME: Is this necessary at all?
 
             for j, node in enumerate(prod.rhs()):
@@ -675,7 +676,7 @@ class CFGDemo(object):
                 ):
                     pass  # matching nonterminal
                 elif (
-                    isinstance(node, str)
+                    isinstance(node, string_types)
                     and isinstance(widget, TextWidget)
                     and node == widget.text()
                 ):
@@ -684,7 +685,7 @@ class CFGDemo(object):
                     break
             else:
                 # Everything matched!
-                print("MATCH AT", i)
+                print('MATCH AT', i)
 
     # //////////////////////////////////////////////////
     # Grammar
@@ -706,16 +707,16 @@ class CFGDemo(object):
 
         # Draw the tree in the treelet area.
         fontsize = int(self._size.get())
-        node_font = ("helvetica", -(fontsize + 4), "bold")
-        leaf_font = ("helvetica", -(fontsize + 2))
+        node_font = ('helvetica', -(fontsize + 4), 'bold')
+        leaf_font = ('helvetica', -(fontsize + 2))
         self._treelet = tree_to_treesegment(
             canvas, tree, node_font=node_font, leaf_font=leaf_font
         )
-        self._treelet["draggable"] = 1
+        self._treelet['draggable'] = 1
 
         # Center the treelet.
         (x1, y1, x2, y2) = self._treelet.bbox()
-        w, h = int(canvas["width"]), int(canvas["height"])
+        w, h = int(canvas['width']), int(canvas['height'])
         self._treelet.move((w - x1 - x2) / 2, (h - y1 - y2) / 2)
 
         # Mark the places where we can add it to the workspace.
@@ -731,7 +732,7 @@ class CFGDemo(object):
 def demo2():
     from nltk import Nonterminal, Production, CFG
 
-    nonterminals = "S VP NP PP P N Name V Det"
+    nonterminals = 'S VP NP PP P N Name V Det'
     (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()]
     productions = (
         # Syntactic Productions
@@ -743,23 +744,23 @@ def demo2():
         Production(VP, [V, NP]),
         Production(PP, [P, NP]),
         Production(PP, []),
-        Production(PP, ["up", "over", NP]),
+        Production(PP, ['up', 'over', NP]),
         # Lexical Productions
-        Production(NP, ["I"]),
-        Production(Det, ["the"]),
-        Production(Det, ["a"]),
-        Production(N, ["man"]),
-        Production(V, ["saw"]),
-        Production(P, ["in"]),
-        Production(P, ["with"]),
-        Production(N, ["park"]),
-        Production(N, ["dog"]),
-        Production(N, ["statue"]),
-        Production(Det, ["my"]),
+        Production(NP, ['I']),
+        Production(Det, ['the']),
+        Production(Det, ['a']),
+        Production(N, ['man']),
+        Production(V, ['saw']),
+        Production(P, ['in']),
+        Production(P, ['with']),
+        Production(N, ['park']),
+        Production(N, ['dog']),
+        Production(N, ['statue']),
+        Production(Det, ['my']),
     )
     grammar = CFG(S, productions)
 
-    text = "I saw a man in the park".split()
+    text = 'I saw a man in the park'.split()
     d = CFGDemo(grammar, text)
     d.mainloop()
 
@@ -772,7 +773,7 @@ def demo2():
 def demo():
     from nltk import Nonterminal, CFG
 
-    nonterminals = "S VP NP PP P N Name V Det"
+    nonterminals = 'S VP NP PP P N Name V Det'
     (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()]
 
     grammar = CFG.fromstring(
@@ -804,8 +805,8 @@ def demo():
 
     top = Tk()
     editor = CFGEditor(top, grammar, cb)
-    Label(top, text="\nTesting CFG Editor\n").pack()
-    Button(top, text="Quit", command=top.destroy).pack()
+    Label(top, text='\nTesting CFG Editor\n').pack()
+    Button(top, text='Quit', command=top.destroy).pack()
     top.mainloop()
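Reviewer note: demo() above feeds the editor a grammar built with CFG.fromstring; a standalone, headless sketch of that API (the grammar text is illustrative):

    from nltk import CFG

    grammar = CFG.fromstring("""
        S -> NP VP
        NP -> Det N | 'I'
        VP -> V NP | V NP PP
        PP -> P NP
        Det -> 'the' | 'a' | 'my'
        N -> 'man' | 'park' | 'dog'
        V -> 'saw'
        P -> 'in' | 'with'
    """)
    print(grammar.start())               # S
    print(len(grammar.productions()))    # 15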
 
 
@@ -813,7 +814,7 @@ def demo3():
     from nltk import Production
 
     (S, VP, NP, PP, P, N, Name, V, Det) = nonterminals(
-        "S, VP, NP, PP, P, N, Name, V, Det"
+        'S, VP, NP, PP, P, N, Name, V, Det'
     )
 
     productions = (
@@ -826,19 +827,19 @@ def demo3():
         Production(VP, [V, NP]),
         Production(PP, [P, NP]),
         Production(PP, []),
-        Production(PP, ["up", "over", NP]),
+        Production(PP, ['up', 'over', NP]),
         # Lexical Productions
-        Production(NP, ["I"]),
-        Production(Det, ["the"]),
-        Production(Det, ["a"]),
-        Production(N, ["man"]),
-        Production(V, ["saw"]),
-        Production(P, ["in"]),
-        Production(P, ["with"]),
-        Production(N, ["park"]),
-        Production(N, ["dog"]),
-        Production(N, ["statue"]),
-        Production(Det, ["my"]),
+        Production(NP, ['I']),
+        Production(Det, ['the']),
+        Production(Det, ['a']),
+        Production(N, ['man']),
+        Production(V, ['saw']),
+        Production(P, ['in']),
+        Production(P, ['with']),
+        Production(N, ['park']),
+        Production(N, ['dog']),
+        Production(N, ['statue']),
+        Production(Det, ['my']),
     )
 
     t = Tk()
@@ -846,15 +847,15 @@ def demo3():
     def destroy(e, t=t):
         t.destroy()
 
-    t.bind("q", destroy)
+    t.bind('q', destroy)
     p = ProductionList(t, productions)
-    p.pack(expand=1, fill="both")
-    p.add_callback("select", p.markonly)
-    p.add_callback("move", p.markonly)
+    p.pack(expand=1, fill='both')
+    p.add_callback('select', p.markonly)
+    p.add_callback('move', p.markonly)
     p.focus()
     p.mark(productions[2])
     p.mark(productions[8])
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
index d0717af..40b2a9a 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Dispersion Plots
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -26,8 +26,8 @@ def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Pl
         from matplotlib import pylab
     except ImportError:
         raise ValueError(
-            "The plot function requires matplotlib to be installed."
-            "See http://matplotlib.org/"
+            'The plot function requires matplotlib to be installed.'
+            'See http://matplotlib.org/'
         )
 
     text = list(text)
@@ -58,8 +58,9 @@ def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Pl
     pylab.show()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
+    import nltk.compat
     from nltk.corpus import gutenberg
 
-    words = ["Elinor", "Marianne", "Edward", "Willoughby"]
-    dispersion_plot(gutenberg.words("austen-sense.txt"), words)
+    words = ['Elinor', 'Marianne', 'Edward', 'Willoughby']
+    dispersion_plot(gutenberg.words('austen-sense.txt'), words)
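Reviewer note: stripped of matplotlib, a dispersion plot is just the token offset of every occurrence of each target word; a tiny illustration (the helper name here is made up):

    def word_offsets(text, words, ignore_case=False):
        if ignore_case:
            text = [t.lower() for t in text]
            words = [w.lower() for w in words]
        return {w: [i for i, tok in enumerate(text) if tok == w] for w in words}

    print(word_offsets('the dog saw the cat and the dog barked'.split(),
                       ['dog', 'cat']))
    # {'dog': [1, 7], 'cat': [4]}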
index 7ca4a2d..aea70b4 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Table widget
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -9,9 +9,12 @@
 Tkinter widgets for displaying multi-column listboxes and tables.
 """
 
+from __future__ import division
+
+
 import operator
 
-from tkinter import Frame, Label, Listbox, Scrollbar, Tk
+from six.moves.tkinter import Frame, Label, Listbox, Scrollbar, Tk
 
 
 ######################################################################
@@ -35,15 +38,15 @@ class MultiListbox(Frame):
     # /////////////////////////////////////////////////////////////////
 
     #: Default configuration values for the frame.
-    FRAME_CONFIG = dict(background="#888", takefocus=True, highlightthickness=1)
+    FRAME_CONFIG = dict(background='#888', takefocus=True, highlightthickness=1)
 
     #: Default configurations for the column labels.
     LABEL_CONFIG = dict(
         borderwidth=1,
-        relief="raised",
-        font="helvetica -16 bold",
-        background="#444",
-        foreground="white",
+        relief='raised',
+        font='helvetica -16 bold',
+        background='#444',
+        foreground='white',
     )
 
     #: Default configuration for the column listboxes.
@@ -52,8 +55,8 @@ class MultiListbox(Frame):
         selectborderwidth=0,
         highlightthickness=0,
         exportselection=False,
-        selectbackground="#888",
-        activestyle="none",
+        selectbackground='#888',
+        activestyle='none',
         takefocus=False,
     )
 
@@ -100,7 +103,7 @@ class MultiListbox(Frame):
         if column_weights is None:
             column_weights = [1] * len(columns)
         elif len(column_weights) != len(columns):
-            raise ValueError("Expected one column_weight for each column")
+            raise ValueError('Expected one column_weight for each column')
         self._column_weights = column_weights
 
         # Configure our widgets.
@@ -113,40 +116,40 @@ class MultiListbox(Frame):
             if include_labels:
                 l = Label(self, text=label, **self.LABEL_CONFIG)
                 self._labels.append(l)
-                l.grid(column=i, row=0, sticky="news", padx=0, pady=0)
+                l.grid(column=i, row=0, sticky='news', padx=0, pady=0)
                 l.column_index = i
 
             # Create a listbox for the column
             lb = Listbox(self, **self.LISTBOX_CONFIG)
             self._listboxes.append(lb)
-            lb.grid(column=i, row=1, sticky="news", padx=0, pady=0)
+            lb.grid(column=i, row=1, sticky='news', padx=0, pady=0)
             lb.column_index = i
 
             # Clicking or dragging selects:
-            lb.bind("<Button-1>", self._select)
-            lb.bind("<B1-Motion>", self._select)
+            lb.bind('<Button-1>', self._select)
+            lb.bind('<B1-Motion>', self._select)
             # Scroll wheel scrolls:
-            lb.bind("<Button-4>", lambda e: self._scroll(-1))
-            lb.bind("<Button-5>", lambda e: self._scroll(+1))
-            lb.bind("<MouseWheel>", lambda e: self._scroll(e.delta))
+            lb.bind('<Button-4>', lambda e: self._scroll(-1))
+            lb.bind('<Button-5>', lambda e: self._scroll(+1))
+            lb.bind('<MouseWheel>', lambda e: self._scroll(e.delta))
             # Button 2 can be used to scan:
-            lb.bind("<Button-2>", lambda e: self.scan_mark(e.x, e.y))
-            lb.bind("<B2-Motion>", lambda e: self.scan_dragto(e.x, e.y))
+            lb.bind('<Button-2>', lambda e: self.scan_mark(e.x, e.y))
+            lb.bind('<B2-Motion>', lambda e: self.scan_dragto(e.x, e.y))
             # Dragging outside the window has no effect (disable
             # the default listbox behavior, which scrolls):
-            lb.bind("<B1-Leave>", lambda e: "break")
+            lb.bind('<B1-Leave>', lambda e: 'break')
             # Columns can be resized by dragging them:
-            l.bind("<Button-1>", self._resize_column)
+            l.bind('<Button-1>', self._resize_column)
 
         # Columns can be resized by dragging them.  (This binding is
         # used if they click on the grid between columns:)
-        self.bind("<Button-1>", self._resize_column)
+        self.bind('<Button-1>', self._resize_column)
 
         # Set up key bindings for the widget:
-        self.bind("<Up>", lambda e: self.select(delta=-1))
-        self.bind("<Down>", lambda e: self.select(delta=1))
-        self.bind("<Prior>", lambda e: self.select(delta=-self._pagesize()))
-        self.bind("<Next>", lambda e: self.select(delta=self._pagesize()))
+        self.bind('<Up>', lambda e: self.select(delta=-1))
+        self.bind('<Down>', lambda e: self.select(delta=1))
+        self.bind('<Prior>', lambda e: self.select(delta=-self._pagesize()))
+        self.bind('<Next>', lambda e: self.select(delta=self._pagesize()))
 
         # Configuration customizations
         self.configure(cnf, **kw)
@@ -164,7 +167,7 @@ class MultiListbox(Frame):
         """
         # If we're already waiting for a button release, then ignore
         # the new button press.
-        if event.widget.bind("<ButtonRelease>"):
+        if event.widget.bind('<ButtonRelease>'):
             return False
 
         # Decide which column (if any) to resize.
@@ -180,9 +183,9 @@ class MultiListbox(Frame):
 
         # Bind callbacks that are used to resize it.
         if self._resize_column_index is not None:
-            event.widget.bind("<Motion>", self._resize_column_motion_cb)
+            event.widget.bind('<Motion>', self._resize_column_motion_cb)
             event.widget.bind(
-                "<ButtonRelease-%d>" % event.num, self._resize_column_buttonrelease_cb
+                '<ButtonRelease-%d>' % event.num, self._resize_column_buttonrelease_cb
             )
             return True
         else:
@@ -190,16 +193,16 @@ class MultiListbox(Frame):
 
     def _resize_column_motion_cb(self, event):
         lb = self._listboxes[self._resize_column_index]
-        charwidth = lb.winfo_width() / lb["width"]
+        charwidth = lb.winfo_width() / lb['width']
 
         x1 = event.x + event.widget.winfo_x()
         x2 = lb.winfo_x() + lb.winfo_width()
 
-        lb["width"] = max(3, lb["width"] + (x1 - x2) // charwidth)
+        lb['width'] = max(3, lb['width'] + (x1 - x2) // charwidth)
 
     def _resize_column_buttonrelease_cb(self, event):
-        event.widget.unbind("<ButtonRelease-%d>" % event.num)
-        event.widget.unbind("<Motion>")
+        event.widget.unbind('<ButtonRelease-%d>' % event.num)
+        event.widget.unbind('<Motion>')
 
     # /////////////////////////////////////////////////////////////////
     # Properties
@@ -243,19 +246,19 @@ class MultiListbox(Frame):
 
     def _select(self, e):
         i = e.widget.nearest(e.y)
-        self.selection_clear(0, "end")
+        self.selection_clear(0, 'end')
         self.selection_set(i)
         self.activate(i)
         self.focus()
 
     def _scroll(self, delta):
         for lb in self._listboxes:
-            lb.yview_scroll(delta, "unit")
-        return "break"
+            lb.yview_scroll(delta, 'unit')
+        return 'break'
 
     def _pagesize(self):
         """:return: The number of rows that makes up one page"""
-        return int(self.index("@0,1000000")) - int(self.index("@0,0"))
+        return int(self.index('@0,1000000')) - int(self.index('@0,0'))
 
     # /////////////////////////////////////////////////////////////////
     # Row selection
@@ -273,7 +276,7 @@ class MultiListbox(Frame):
             selected index, to ensure that it is visible.
         """
         if (index is not None) and (delta is not None):
-            raise ValueError("specify index or delta, but not both")
+            raise ValueError('specify index or delta, but not both')
 
         # If delta was given, then calculate index.
         if delta is not None:
@@ -283,7 +286,7 @@ class MultiListbox(Frame):
                 index = int(self.curselection()[0]) + delta
 
         # Clear all selected rows.
-        self.selection_clear(0, "end")
+        self.selection_clear(0, 'end')
 
         # Select the specified index
         if index is not None:
@@ -308,10 +311,10 @@ class MultiListbox(Frame):
         """
         cnf = dict(list(cnf.items()) + list(kw.items()))
         for (key, val) in list(cnf.items()):
-            if key.startswith("label_") or key.startswith("label-"):
+            if key.startswith('label_') or key.startswith('label-'):
                 for label in self._labels:
                     label.configure({key[6:]: val})
-            elif key.startswith("listbox_") or key.startswith("listbox-"):
+            elif key.startswith('listbox_') or key.startswith('listbox-'):
                 for listbox in self._listboxes:
                     listbox.configure({key[8:]: val})
             else:
@@ -344,12 +347,12 @@ class MultiListbox(Frame):
         cnf = dict(list(cnf.items()) + list(kw.items()))
         for (key, val) in list(cnf.items()):
             if key in (
-                "background",
-                "bg",
-                "foreground",
-                "fg",
-                "selectbackground",
-                "selectforeground",
+                'background',
+                'bg',
+                'foreground',
+                'fg',
+                'selectbackground',
+                'selectforeground',
             ):
                 for i in range(lb.size()):
                     lb.itemconfigure(i, {key: val})
@@ -380,8 +383,8 @@ class MultiListbox(Frame):
         for elt in rows:
             if len(elt) != len(self._column_names):
                 raise ValueError(
-                    "rows should be tuples whose length "
-                    "is equal to the number of columns"
+                    'rows should be tuples whose length '
+                    'is equal to the number of columns'
                 )
         for (lb, elts) in zip(self._listboxes, list(zip(*rows))):
             lb.insert(index, *elts)
@@ -435,10 +438,10 @@ class MultiListbox(Frame):
         weight = self._column_weights[col_index]
         if self._labels:
             self._labels[col_index].grid(
-                column=col_index, row=0, sticky="news", padx=0, pady=0
+                column=col_index, row=0, sticky='news', padx=0, pady=0
             )
         self._listboxes[col_index].grid(
-            column=col_index, row=1, sticky="news", padx=0, pady=0
+            column=col_index, row=1, sticky='news', padx=0, pady=0
         )
         self.grid_columnconfigure(col_index, weight=weight)
 
@@ -677,22 +680,22 @@ class Table(object):
 
         # Create our multi-list box.
         self._mlb = MultiListbox(self._frame, column_names, column_weights, cnf, **kw)
-        self._mlb.pack(side="left", expand=True, fill="both")
+        self._mlb.pack(side='left', expand=True, fill='both')
 
         # Optional scrollbar
         if scrollbar:
-            sb = Scrollbar(self._frame, orient="vertical", command=self._mlb.yview)
-            self._mlb.listboxes[0]["yscrollcommand"] = sb.set
+            sb = Scrollbar(self._frame, orient='vertical', command=self._mlb.yview)
+            self._mlb.listboxes[0]['yscrollcommand'] = sb.set
             # for listbox in self._mlb.listboxes:
             #    listbox['yscrollcommand'] = sb.set
-            sb.pack(side="right", fill="y")
+            sb.pack(side='right', fill='y')
             self._scrollbar = sb
 
         # Set up sorting
         self._sortkey = None
         if click_to_sort:
             for i, l in enumerate(self._mlb.column_labels):
-                l.bind("<Button-1>", self._sort)
+                l.bind('<Button-1>', self._sort)
 
         # Fill in our multi-list box.
         self._fill_table()
@@ -804,7 +807,7 @@ class Table(object):
         Delete all rows in this table.
         """
         self._rows = []
-        self._mlb.delete(0, "end")
+        self._mlb.delete(0, 'end')
         if self._DEBUG:
             self._check_table_vs_mlb()
 
@@ -818,7 +821,7 @@ class Table(object):
         ``i``th row and the ``j``th column.
         """
         if isinstance(index, slice):
-            raise ValueError("Slicing not supported")
+            raise ValueError('Slicing not supported')
         elif isinstance(index, tuple) and len(index) == 2:
             return self._rows[index[0]][self.column_index(index[1])]
         else:
@@ -839,7 +842,7 @@ class Table(object):
         ``val``.
         """
         if isinstance(index, slice):
-            raise ValueError("Slicing not supported")
+            raise ValueError('Slicing not supported')
 
         # table[i,j] = val
         elif isinstance(index, tuple) and len(index) == 2:
@@ -868,9 +871,9 @@ class Table(object):
         Delete the ``row_index``th row from this table.
         """
         if isinstance(row_index, slice):
-            raise ValueError("Slicing not supported")
+            raise ValueError('Slicing not supported')
         if isinstance(row_index, tuple) and len(row_index) == 2:
-            raise ValueError("Cannot delete a single cell!")
+            raise ValueError('Cannot delete a single cell!')
         del self._rows[row_index]
         self._mlb.delete(row_index)
         if self._DEBUG:
@@ -889,7 +892,7 @@ class Table(object):
         """
         if len(rowvalue) != self._num_columns:
             raise ValueError(
-                "Row %r has %d columns; expected %d"
+                'Row %r has %d columns; expected %d'
                 % (rowvalue, len(rowvalue), self._num_columns)
             )
 
@@ -947,7 +950,7 @@ class Table(object):
     # Sorting
     # /////////////////////////////////////////////////////////////////
 
-    def sort_by(self, column_index, order="toggle"):
+    def sort_by(self, column_index, order='toggle'):
         """
         Sort the rows in this table, using the specified column's
         values as a sort key.
@@ -966,7 +969,7 @@ class Table(object):
                 then reverse the rows; otherwise sort in ascending
                 order.
         """
-        if order not in ("ascending", "descending", "toggle"):
+        if order not in ('ascending', 'descending', 'toggle'):
             raise ValueError(
                 'sort_by(): order should be "ascending", ' '"descending", or "toggle".'
             )
@@ -974,11 +977,11 @@ class Table(object):
         config_cookie = self._save_config_info(index_by_id=True)
 
         # Sort the rows.
-        if order == "toggle" and column_index == self._sortkey:
+        if order == 'toggle' and column_index == self._sortkey:
             self._rows.reverse()
         else:
             self._rows.sort(
-                key=operator.itemgetter(column_index), reverse=(order == "descending")
+                key=operator.itemgetter(column_index), reverse=(order == 'descending')
             )
             self._sortkey = column_index
 
@@ -996,12 +999,12 @@ class Table(object):
         # If they click on the far-left of far-right of a column's
         # label, then resize rather than sorting.
         if self._mlb._resize_column(event):
-            return "continue"
+            return 'continue'
 
         # Otherwise, sort.
         else:
             self.sort_by(column_index)
-            return "continue"
+            return 'continue'
 
     # /////////////////////////////////////////////////////////////////
     # { Table Drawing Helpers
@@ -1016,20 +1019,20 @@ class Table(object):
         selection will also be lost -- i.e., no row will be selected
         after this call completes.
         """
-        self._mlb.delete(0, "end")
+        self._mlb.delete(0, 'end')
         for i, row in enumerate(self._rows):
             if self._reprfunc is not None:
                 row = [self._reprfunc(i, j, v) for (j, v) in enumerate(row)]
-            self._mlb.insert("end", row)
+            self._mlb.insert('end', row)
 
     def _get_itemconfig(self, r, c):
         return dict(
             (k, self._mlb.itemconfig(r, c, k)[-1])
             for k in (
-                "foreground",
-                "selectforeground",
-                "background",
-                "selectbackground",
+                'foreground',
+                'selectforeground',
+                'background',
+                'selectbackground',
             )
         )
 
@@ -1083,7 +1086,7 @@ class Table(object):
 
         # Clear the selection.
         if selection is None:
-            self._mlb.selection_clear(0, "end")
+            self._mlb.selection_clear(0, 'end')
 
         # Restore selection & color config
         if index_by_id:
@@ -1135,46 +1138,46 @@ class Table(object):
 # update this to use new WordNet API
 def demo():
     root = Tk()
-    root.bind("<Control-q>", lambda e: root.destroy())
+    root.bind('<Control-q>', lambda e: root.destroy())
 
     table = Table(
         root,
-        "Word Synset Hypernym Hyponym".split(),
+        'Word Synset Hypernym Hyponym'.split(),
         column_weights=[0, 1, 1, 1],
-        reprfunc=(lambda i, j, s: "  %s" % s),
+        reprfunc=(lambda i, j, s: '  %s' % s),
     )
-    table.pack(expand=True, fill="both")
+    table.pack(expand=True, fill='both')
 
     from nltk.corpus import wordnet
     from nltk.corpus import brown
 
     for word, pos in sorted(set(brown.tagged_words()[:500])):
-        if pos[0] != "N":
+        if pos[0] != 'N':
             continue
         word = word.lower()
         for synset in wordnet.synsets(word):
             try:
                 hyper_def = synset.hypernyms()[0].definition()
             except:
-                hyper_def = "*none*"
+                hyper_def = '*none*'
             try:
                 hypo_def = synset.hypernyms()[0].definition()
             except:
-                hypo_def = "*none*"
+                hypo_def = '*none*'
             table.append([word, synset.definition(), hyper_def, hypo_def])
 
-    table.columnconfig("Word", background="#afa")
-    table.columnconfig("Synset", background="#efe")
-    table.columnconfig("Hypernym", background="#fee")
-    table.columnconfig("Hyponym", background="#ffe")
+    table.columnconfig('Word', background='#afa')
+    table.columnconfig('Synset', background='#efe')
+    table.columnconfig('Hypernym', background='#fee')
+    table.columnconfig('Hyponym', background='#ffe')
     for row in range(len(table)):
-        for column in ("Hypernym", "Hyponym"):
-            if table[row, column] == "*none*":
+        for column in ('Hypernym', 'Hyponym'):
+            if table[row, column] == '*none*':
                 table.itemconfig(
-                    row, column, foreground="#666", selectforeground="#666"
+                    row, column, foreground='#666', selectforeground='#666'
                 )
     root.mainloop()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
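
The hunks above only swap quote styles, but they touch most of the public Table surface (append, sort_by, columnconfig, tuple indexing). As a quick orientation, here is a minimal, illustrative sketch of that API; the column names and rows are invented for the example, and the import follows the six.moves convention this commit converts to.

    from six.moves.tkinter import Tk          # Tkinter on Py2, tkinter on Py3
    from nltk.draw.table import Table

    root = Tk()
    # Two made-up columns; Table lays out one listbox per column.
    table = Table(root, ['Word', 'Count'], column_weights=[1, 0])
    table.pack(expand=True, fill='both')
    for word, count in [('the', 3), ('cat', 1), ('sat', 1)]:
        table.append([word, count])           # one value per column, as enforced above
    table.sort_by(1, order='descending')      # same entry point the click handler uses
    print(table[0, 'Word'])                   # (row, column-name) cell access
    root.mainloop()
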
index 33bfb9a..8124f5e 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Graphical Representations for Trees
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -9,7 +9,7 @@
 Graphically display a Tree.
 """
 
-from tkinter import IntVar, Menu, Tk
+from six.moves.tkinter import IntVar, Menu, Tk
 
 from nltk.util import in_idle
 from nltk.tree import Tree
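
This import rewrite is the core of the backport: every tkinter import in nltk.draw is routed through six.moves so the same module loads on Python 2 (Tkinter) and Python 3 (tkinter). A minimal sketch of the pattern, assuming the six package is installed:

    from six.moves.tkinter import Tk, IntVar, Menu              # resolves per interpreter
    from six.moves.tkinter_tkfiledialog import asksaveasfilename

    root = Tk()
    size = IntVar(root)      # the same pattern TreeView uses for its zoom setting
    size.set(12)
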
@@ -77,9 +77,9 @@ class TreeSegmentWidget(CanvasWidget):
         self._ordered = False
 
         # Create canvas objects.
-        self._lines = [canvas.create_line(0, 0, 0, 0, fill="#006060") for c in subtrees]
+        self._lines = [canvas.create_line(0, 0, 0, 0, fill='#006060') for c in subtrees]
         self._polygon = canvas.create_polygon(
-            0, 0, fill="", state="hidden", outline="#006060"
+            0, 0, fill='', state='hidden', outline='#006060'
         )
 
         # Register child widgets (label + subtrees)
@@ -94,68 +94,68 @@ class TreeSegmentWidget(CanvasWidget):
 
     def __setitem__(self, attr, value):
         canvas = self.canvas()
-        if attr == "roof":
+        if attr == 'roof':
             self._roof = value
             if self._roof:
                 for l in self._lines:
-                    canvas.itemconfig(l, state="hidden")
-                canvas.itemconfig(self._polygon, state="normal")
+                    canvas.itemconfig(l, state='hidden')
+                canvas.itemconfig(self._polygon, state='normal')
             else:
                 for l in self._lines:
-                    canvas.itemconfig(l, state="normal")
-                canvas.itemconfig(self._polygon, state="hidden")
-        elif attr == "orientation":
-            if value == "horizontal":
+                    canvas.itemconfig(l, state='normal')
+                canvas.itemconfig(self._polygon, state='hidden')
+        elif attr == 'orientation':
+            if value == 'horizontal':
                 self._horizontal = 1
-            elif value == "vertical":
+            elif value == 'vertical':
                 self._horizontal = 0
             else:
-                raise ValueError("orientation must be horizontal or vertical")
-        elif attr == "color":
+                raise ValueError('orientation must be horizontal or vertical')
+        elif attr == 'color':
             for l in self._lines:
                 canvas.itemconfig(l, fill=value)
             canvas.itemconfig(self._polygon, outline=value)
-        elif isinstance(attr, tuple) and attr[0] == "color":
+        elif isinstance(attr, tuple) and attr[0] == 'color':
             # Set the color of an individual line.
             l = self._lines[int(attr[1])]
             canvas.itemconfig(l, fill=value)
-        elif attr == "fill":
+        elif attr == 'fill':
             canvas.itemconfig(self._polygon, fill=value)
-        elif attr == "width":
+        elif attr == 'width':
             canvas.itemconfig(self._polygon, {attr: value})
             for l in self._lines:
                 canvas.itemconfig(l, {attr: value})
-        elif attr in ("xspace", "yspace"):
-            if attr == "xspace":
+        elif attr in ('xspace', 'yspace'):
+            if attr == 'xspace':
                 self._xspace = value
-            elif attr == "yspace":
+            elif attr == 'yspace':
                 self._yspace = value
             self.update(self._label)
-        elif attr == "ordered":
+        elif attr == 'ordered':
             self._ordered = value
         else:
             CanvasWidget.__setitem__(self, attr, value)
 
     def __getitem__(self, attr):
-        if attr == "roof":
+        if attr == 'roof':
             return self._roof
-        elif attr == "width":
+        elif attr == 'width':
             return self.canvas().itemcget(self._polygon, attr)
-        elif attr == "color":
-            return self.canvas().itemcget(self._polygon, "outline")
-        elif isinstance(attr, tuple) and attr[0] == "color":
+        elif attr == 'color':
+            return self.canvas().itemcget(self._polygon, 'outline')
+        elif isinstance(attr, tuple) and attr[0] == 'color':
             l = self._lines[int(attr[1])]
-            return self.canvas().itemcget(l, "fill")
-        elif attr == "xspace":
+            return self.canvas().itemcget(l, 'fill')
+        elif attr == 'xspace':
             return self._xspace
-        elif attr == "yspace":
+        elif attr == 'yspace':
             return self._yspace
-        elif attr == "orientation":
+        elif attr == 'orientation':
             if self._horizontal:
-                return "horizontal"
+                return 'horizontal'
             else:
-                return "vertical"
-        elif attr == "ordered":
+                return 'vertical'
+        elif attr == 'ordered':
             return self._ordered
         else:
             return CanvasWidget.__getitem__(self, attr)
@@ -196,7 +196,7 @@ class TreeSegmentWidget(CanvasWidget):
         canvas = self.canvas()
         self._subtrees.insert(index, child)
         self._add_child_widget(child)
-        self._lines.append(canvas.create_line(0, 0, 0, 0, fill="#006060"))
+        self._lines.append(canvas.create_line(0, 0, 0, 0, fill='#006060'))
         self.update(self._label)
 
     # but.. lines???
@@ -416,7 +416,7 @@ class TreeSegmentWidget(CanvasWidget):
         self._managing = False
 
     def __repr__(self):
-        return "[TreeSeg %s: %s]" % (self._label, self._subtrees)
+        return '[TreeSeg %s: %s]' % (self._label, self._subtrees)
 
 
 def _tree_to_treeseg(
@@ -479,16 +479,16 @@ def tree_to_treesegment(
     loc_attribs = {}
 
     for (key, value) in list(attribs.items()):
-        if key[:5] == "tree_":
+        if key[:5] == 'tree_':
             tree_attribs[key[5:]] = value
-        elif key[:5] == "node_":
+        elif key[:5] == 'node_':
             node_attribs[key[5:]] = value
-        elif key[:5] == "leaf_":
+        elif key[:5] == 'leaf_':
             leaf_attribs[key[5:]] = value
-        elif key[:4] == "loc_":
+        elif key[:4] == 'loc_':
             loc_attribs[key[4:]] = value
         else:
-            raise ValueError("Bad attribute: %s" % key)
+            raise ValueError('Bad attribute: %s' % key)
     return _tree_to_treeseg(
         canvas,
         t,
@@ -558,15 +558,15 @@ class TreeWidget(CanvasWidget):
         # Attributes.
         self._nodeattribs = {}
         self._leafattribs = {}
-        self._locattribs = {"color": "#008000"}
-        self._line_color = "#008080"
+        self._locattribs = {'color': '#008000'}
+        self._line_color = '#008080'
         self._line_width = 1
-        self._roof_color = "#008080"
-        self._roof_fill = "#c0c0c0"
+        self._roof_color = '#008080'
+        self._roof_fill = '#c0c0c0'
         self._shapeable = False
         self._xspace = 10
         self._yspace = 10
-        self._orientation = "vertical"
+        self._orientation = 'vertical'
         self._ordered = False
 
         # Build trees.
@@ -712,90 +712,90 @@ class TreeWidget(CanvasWidget):
             return leaf
 
     def __setitem__(self, attr, value):
-        if attr[:5] == "node_":
+        if attr[:5] == 'node_':
             for node in self._nodes:
                 node[attr[5:]] = value
-        elif attr[:5] == "leaf_":
+        elif attr[:5] == 'leaf_':
             for leaf in self._leaves:
                 leaf[attr[5:]] = value
-        elif attr == "line_color":
+        elif attr == 'line_color':
             self._line_color = value
             for tseg in list(self._expanded_trees.values()):
-                tseg["color"] = value
-        elif attr == "line_width":
+                tseg['color'] = value
+        elif attr == 'line_width':
             self._line_width = value
             for tseg in list(self._expanded_trees.values()):
-                tseg["width"] = value
+                tseg['width'] = value
             for tseg in list(self._collapsed_trees.values()):
-                tseg["width"] = value
-        elif attr == "roof_color":
+                tseg['width'] = value
+        elif attr == 'roof_color':
             self._roof_color = value
             for tseg in list(self._collapsed_trees.values()):
-                tseg["color"] = value
-        elif attr == "roof_fill":
+                tseg['color'] = value
+        elif attr == 'roof_fill':
             self._roof_fill = value
             for tseg in list(self._collapsed_trees.values()):
-                tseg["fill"] = value
-        elif attr == "shapeable":
+                tseg['fill'] = value
+        elif attr == 'shapeable':
             self._shapeable = value
             for tseg in list(self._expanded_trees.values()):
-                tseg["draggable"] = value
+                tseg['draggable'] = value
             for tseg in list(self._collapsed_trees.values()):
-                tseg["draggable"] = value
+                tseg['draggable'] = value
             for leaf in self._leaves:
-                leaf["draggable"] = value
-        elif attr == "xspace":
+                leaf['draggable'] = value
+        elif attr == 'xspace':
             self._xspace = value
             for tseg in list(self._expanded_trees.values()):
-                tseg["xspace"] = value
+                tseg['xspace'] = value
             for tseg in list(self._collapsed_trees.values()):
-                tseg["xspace"] = value
+                tseg['xspace'] = value
             self.manage()
-        elif attr == "yspace":
+        elif attr == 'yspace':
             self._yspace = value
             for tseg in list(self._expanded_trees.values()):
-                tseg["yspace"] = value
+                tseg['yspace'] = value
             for tseg in list(self._collapsed_trees.values()):
-                tseg["yspace"] = value
+                tseg['yspace'] = value
             self.manage()
-        elif attr == "orientation":
+        elif attr == 'orientation':
             self._orientation = value
             for tseg in list(self._expanded_trees.values()):
-                tseg["orientation"] = value
+                tseg['orientation'] = value
             for tseg in list(self._collapsed_trees.values()):
-                tseg["orientation"] = value
+                tseg['orientation'] = value
             self.manage()
-        elif attr == "ordered":
+        elif attr == 'ordered':
             self._ordered = value
             for tseg in list(self._expanded_trees.values()):
-                tseg["ordered"] = value
+                tseg['ordered'] = value
             for tseg in list(self._collapsed_trees.values()):
-                tseg["ordered"] = value
+                tseg['ordered'] = value
         else:
             CanvasWidget.__setitem__(self, attr, value)
 
     def __getitem__(self, attr):
-        if attr[:5] == "node_":
+        if attr[:5] == 'node_':
             return self._nodeattribs.get(attr[5:], None)
-        elif attr[:5] == "leaf_":
+        elif attr[:5] == 'leaf_':
             return self._leafattribs.get(attr[5:], None)
-        elif attr[:4] == "loc_":
+        elif attr[:4] == 'loc_':
             return self._locattribs.get(attr[4:], None)
-        elif attr == "line_color":
+        elif attr == 'line_color':
             return self._line_color
-        elif attr == "line_width":
+        elif attr == 'line_width':
             return self._line_width
-        elif attr == "roof_color":
+        elif attr == 'roof_color':
             return self._roof_color
-        elif attr == "roof_fill":
+        elif attr == 'roof_fill':
             return self._roof_fill
-        elif attr == "shapeable":
+        elif attr == 'shapeable':
             return self._shapeable
-        elif attr == "xspace":
+        elif attr == 'xspace':
             return self._xspace
-        elif attr == "yspace":
+        elif attr == 'yspace':
             return self._yspace
-        elif attr == "orientation":
+        elif attr == 'orientation':
             return self._orientation
         else:
             return CanvasWidget.__getitem__(self, attr)
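
For orientation, the __setitem__/__getitem__ pair above gives TreeWidget a dict-style styling protocol: 'node_*' and 'leaf_*' keys fan out to the node and leaf widgets, while keys such as 'line_color' or 'xspace' update every tree segment. A small illustrative sketch (the tree and colours are made up):

    from nltk.tree import Tree
    from nltk.draw.util import CanvasFrame
    from nltk.draw.tree import TreeWidget

    cf = CanvasFrame(width=400, height=300)
    widget = TreeWidget(cf.canvas(), Tree.fromstring('(S (NP I) (VP (V saw) (NP it)))'))
    widget['node_color'] = '#004080'   # applied to every node label
    widget['line_color'] = '#004040'   # recolours the existing segments
    widget['xspace'] = 20              # widens the layout and re-manages it
    cf.add_widget(widget, 10, 10)
    cf.mainloop()
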
@@ -818,7 +818,7 @@ class TreeWidget(CanvasWidget):
         Collapse/expand a tree.
         """
         old_treeseg = treeseg
-        if old_treeseg["roof"]:
+        if old_treeseg['roof']:
             new_treeseg = self._expanded_trees[self._keys[old_treeseg]]
         else:
             new_treeseg = self._collapsed_trees[self._keys[old_treeseg]]
@@ -857,18 +857,18 @@ class TreeView(object):
         self._trees = trees
 
         self._top = Tk()
-        self._top.title("NLTK")
-        self._top.bind("<Control-x>", self.destroy)
-        self._top.bind("<Control-q>", self.destroy)
+        self._top.title('NLTK')
+        self._top.bind('<Control-x>', self.destroy)
+        self._top.bind('<Control-q>', self.destroy)
 
         cf = self._cframe = CanvasFrame(self._top)
-        self._top.bind("<Control-p>", self._cframe.print_to_file)
+        self._top.bind('<Control-p>', self._cframe.print_to_file)
 
         # Size is variable.
         self._size = IntVar(self._top)
         self._size.set(12)
-        bold = ("helvetica", -self._size.get(), "bold")
-        helv = ("helvetica", -self._size.get())
+        bold = ('helvetica', -self._size.get(), 'bold')
+        helv = ('helvetica', -self._size.get())
 
         # Lay the trees out in a square.
         self._width = int(ceil(sqrt(len(trees))))
@@ -878,11 +878,11 @@ class TreeView(object):
                 cf.canvas(),
                 trees[i],
                 node_font=bold,
-                leaf_color="#008040",
-                node_color="#004080",
-                roof_color="#004040",
-                roof_fill="white",
-                line_color="#004040",
+                leaf_color='#008040',
+                node_color='#004080',
+                roof_color='#004040',
+                roof_fill='white',
+                line_color='#004040',
                 draggable=1,
                 leaf_font=helv,
             )
@@ -891,7 +891,7 @@ class TreeView(object):
             cf.add_widget(widget, 0, 0)
 
         self._layout()
-        self._cframe.pack(expand=1, fill="both")
+        self._cframe.pack(expand=1, fill='both')
         self._init_menubar()
 
     def _layout(self):
@@ -912,72 +912,72 @@ class TreeView(object):
 
         filemenu = Menu(menubar, tearoff=0)
         filemenu.add_command(
-            label="Print to Postscript",
+            label='Print to Postscript',
             underline=0,
             command=self._cframe.print_to_file,
-            accelerator="Ctrl-p",
+            accelerator='Ctrl-p',
         )
         filemenu.add_command(
-            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+            label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
         )
-        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
 
         zoommenu = Menu(menubar, tearoff=0)
         zoommenu.add_radiobutton(
-            label="Tiny",
+            label='Tiny',
             variable=self._size,
             underline=0,
             value=10,
             command=self.resize,
         )
         zoommenu.add_radiobutton(
-            label="Small",
+            label='Small',
             variable=self._size,
             underline=0,
             value=12,
             command=self.resize,
         )
         zoommenu.add_radiobutton(
-            label="Medium",
+            label='Medium',
             variable=self._size,
             underline=0,
             value=14,
             command=self.resize,
         )
         zoommenu.add_radiobutton(
-            label="Large",
+            label='Large',
             variable=self._size,
             underline=0,
             value=28,
             command=self.resize,
         )
         zoommenu.add_radiobutton(
-            label="Huge",
+            label='Huge',
             variable=self._size,
             underline=0,
             value=50,
             command=self.resize,
         )
-        menubar.add_cascade(label="Zoom", underline=0, menu=zoommenu)
+        menubar.add_cascade(label='Zoom', underline=0, menu=zoommenu)
 
         self._top.config(menu=menubar)
 
     def resize(self, *e):
-        bold = ("helvetica", -self._size.get(), "bold")
-        helv = ("helvetica", -self._size.get())
+        bold = ('helvetica', -self._size.get(), 'bold')
+        helv = ('helvetica', -self._size.get())
         xspace = self._size.get()
         yspace = self._size.get()
         for widget in self._widgets:
-            widget["node_font"] = bold
-            widget["leaf_font"] = helv
-            widget["xspace"] = xspace
-            widget["yspace"] = yspace
+            widget['node_font'] = bold
+            widget['leaf_font'] = helv
+            widget['xspace'] = xspace
+            widget['yspace'] = yspace
             if self._size.get() < 20:
-                widget["line_width"] = 1
+                widget['line_width'] = 1
             elif self._size.get() < 30:
-                widget["line_width"] = 2
+                widget['line_width'] = 2
             else:
-                widget["line_width"] = 3
+                widget['line_width'] = 3
         self._layout()
 
     def destroy(self, *e):
@@ -1018,45 +1018,45 @@ def demo():
     import random
 
     def fill(cw):
-        cw["fill"] = "#%06d" % random.randint(0, 999999)
+        cw['fill'] = '#%06d' % random.randint(0, 999999)
 
     cf = CanvasFrame(width=550, height=450, closeenough=2)
 
     t = Tree.fromstring(
-        """
+        '''
     (S (NP the very big cat)
-       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))"""
+       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))'''
     )
 
     tc = TreeWidget(
         cf.canvas(),
         t,
         draggable=1,
-        node_font=("helvetica", -14, "bold"),
-        leaf_font=("helvetica", -12, "italic"),
-        roof_fill="white",
-        roof_color="black",
-        leaf_color="green4",
-        node_color="blue2",
+        node_font=('helvetica', -14, 'bold'),
+        leaf_font=('helvetica', -12, 'italic'),
+        roof_fill='white',
+        roof_color='black',
+        leaf_color='green4',
+        node_color='blue2',
     )
     cf.add_widget(tc, 10, 10)
 
     def boxit(canvas, text):
-        big = ("helvetica", -16, "bold")
-        return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill="green")
+        big = ('helvetica', -16, 'bold')
+        return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill='green')
 
     def ovalit(canvas, text):
-        return OvalWidget(canvas, TextWidget(canvas, text), fill="cyan")
+        return OvalWidget(canvas, TextWidget(canvas, text), fill='cyan')
 
-    treetok = Tree.fromstring("(S (NP this tree) (VP (V is) (AdjP shapeable)))")
+    treetok = Tree.fromstring('(S (NP this tree) (VP (V is) (AdjP shapeable)))')
     tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)
 
     def color(node):
-        node["color"] = "#%04d00" % random.randint(0, 9999)
+        node['color'] = '#%04d00' % random.randint(0, 9999)
 
     def color2(treeseg):
-        treeseg.label()["fill"] = "#%06d" % random.randint(0, 9999)
-        treeseg.label().child()["color"] = "white"
+        treeseg.label()['fill'] = '#%06d' % random.randint(0, 9999)
+        treeseg.label().child()['color'] = 'white'
 
     tc.bind_click_trees(tc.toggle_collapsed)
     tc2.bind_click_trees(tc2.toggle_collapsed)
@@ -1068,29 +1068,29 @@ def demo():
     cf.add_widget(paren, tc.bbox()[2] + 10, 10)
 
     tree3 = Tree.fromstring(
-        """
+        '''
     (S (NP this tree) (AUX was)
-       (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))"""
+       (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))'''
     )
     tc3 = tree_to_treesegment(
-        cf.canvas(), tree3, tree_color="green4", tree_xspace=2, tree_width=2
+        cf.canvas(), tree3, tree_color='green4', tree_xspace=2, tree_width=2
     )
-    tc3["draggable"] = 1
+    tc3['draggable'] = 1
     cf.add_widget(tc3, 10, tc.bbox()[3] + 10)
 
     def orientswitch(treewidget):
-        if treewidget["orientation"] == "horizontal":
-            treewidget.expanded_tree(1, 1).subtrees()[0].set_text("vertical")
-            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("vertical")
-            treewidget.collapsed_tree(1).subtrees()[1].set_text("vertical")
-            treewidget.collapsed_tree().subtrees()[3].set_text("vertical")
-            treewidget["orientation"] = "vertical"
+        if treewidget['orientation'] == 'horizontal':
+            treewidget.expanded_tree(1, 1).subtrees()[0].set_text('vertical')
+            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text('vertical')
+            treewidget.collapsed_tree(1).subtrees()[1].set_text('vertical')
+            treewidget.collapsed_tree().subtrees()[3].set_text('vertical')
+            treewidget['orientation'] = 'vertical'
         else:
-            treewidget.expanded_tree(1, 1).subtrees()[0].set_text("horizontal")
-            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("horizontal")
-            treewidget.collapsed_tree(1).subtrees()[1].set_text("horizontal")
-            treewidget.collapsed_tree().subtrees()[3].set_text("horizontal")
-            treewidget["orientation"] = "horizontal"
+            treewidget.expanded_tree(1, 1).subtrees()[0].set_text('horizontal')
+            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text('horizontal')
+            treewidget.collapsed_tree(1).subtrees()[1].set_text('horizontal')
+            treewidget.collapsed_tree().subtrees()[3].set_text('horizontal')
+            treewidget['orientation'] = 'horizontal'
 
     text = """
 Try clicking, right clicking, and dragging
@@ -1102,19 +1102,19 @@ constructors for the nodes & leaves (BoxWidget
 and OvalWidget).  The bottom-left tree is
 built from tree_to_treesegment."""
     twidget = TextWidget(cf.canvas(), text.strip())
-    textbox = BoxWidget(cf.canvas(), twidget, fill="white", draggable=1)
+    textbox = BoxWidget(cf.canvas(), twidget, fill='white', draggable=1)
     cf.add_widget(textbox, tc3.bbox()[2] + 10, tc2.bbox()[3] + 10)
 
-    tree4 = Tree.fromstring("(S (NP this tree) (VP (V is) (Adj horizontal)))")
+    tree4 = Tree.fromstring('(S (NP this tree) (VP (V is) (Adj horizontal)))')
     tc4 = TreeWidget(
         cf.canvas(),
         tree4,
         draggable=1,
-        line_color="brown2",
-        roof_color="brown2",
-        node_font=("helvetica", -12, "bold"),
-        node_color="brown4",
-        orientation="horizontal",
+        line_color='brown2',
+        roof_color='brown2',
+        node_font=('helvetica', -12, 'bold'),
+        node_color='brown4',
+        orientation='horizontal',
     )
     tc4.manage()
     cf.add_widget(tc4, tc3.bbox()[2] + 10, textbox.bbox()[3] + 10)
@@ -1125,5 +1125,5 @@ built from tree_to_treesegment."""
     cf.mainloop()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
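
The demo above builds its widgets by hand; in ordinary use the same viewer is reached through the module-level helpers. A minimal sketch:

    from nltk.tree import Tree
    from nltk.draw.tree import draw_trees

    t = Tree.fromstring('(S (NP the dog) (VP barked))')
    draw_trees(t)        # opens the TreeView window defined above (zoom + print menus)
    # or equivalently: t.draw()
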
index be2db58..9daebbb 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Drawing utilities
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -34,7 +34,8 @@ homepage (http://www.ags.uni-sb.de/~konrad/clig.html).
 
 """
 from abc import ABCMeta, abstractmethod
-from tkinter import (
+from six import add_metaclass
+from six.moves.tkinter import (
     Button,
     Canvas,
     Entry,
@@ -50,7 +51,7 @@ from tkinter import (
     Widget,
     RAISED,
 )
-from tkinter.filedialog import asksaveasfilename
+from six.moves.tkinter_tkfiledialog import asksaveasfilename
 
 from nltk.util import in_idle
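
Besides the import shim, the hunk that follows rewrites the abstract base class so it no longer relies on Python-3-only metaclass syntax; six.add_metaclass attaches ABCMeta in a form both interpreters accept. Sketch of the pattern on a hypothetical class:

    from abc import ABCMeta, abstractmethod
    from six import add_metaclass

    @add_metaclass(ABCMeta)            # equivalent to `metaclass=ABCMeta` on Python 3
    class Shape(object):               # hypothetical example class, not from the patch
        @abstractmethod
        def area(self):
            raise NotImplementedError
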
 
@@ -59,7 +60,8 @@ from nltk.util import in_idle
 ##//////////////////////////////////////////////////////
 
 
-class CanvasWidget(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class CanvasWidget(object):
     """
     A collection of graphical elements and bindings used to display a
     complex object on a Tkinter ``Canvas``.  A canvas widget is
@@ -188,17 +190,17 @@ class CanvasWidget(metaclass=ABCMeta):
         :param attribs: The new canvas widget's attributes.
         """
         if self.__class__ == CanvasWidget:
-            raise TypeError("CanvasWidget is an abstract base class")
+            raise TypeError('CanvasWidget is an abstract base class')
 
         if not isinstance(canvas, Canvas):
-            raise TypeError("Expected a canvas!")
+            raise TypeError('Expected a canvas!')
 
         self.__canvas = canvas
         self.__parent = parent
 
         # If the subclass constructor called _add_child_widget, then
         # self.__children will already exist.
-        if not hasattr(self, "_CanvasWidget__children"):
+        if not hasattr(self, '_CanvasWidget__children'):
             self.__children = []
 
         # Is this widget hidden?
@@ -222,9 +224,9 @@ class CanvasWidget(metaclass=ABCMeta):
 
         # Register any new bindings
         for tag in self._tags():
-            self.__canvas.tag_bind(tag, "<ButtonPress-1>", self.__press_cb)
-            self.__canvas.tag_bind(tag, "<ButtonPress-2>", self.__press_cb)
-            self.__canvas.tag_bind(tag, "<ButtonPress-3>", self.__press_cb)
+            self.__canvas.tag_bind(tag, '<ButtonPress-1>', self.__press_cb)
+            self.__canvas.tag_bind(tag, '<ButtonPress-2>', self.__press_cb)
+            self.__canvas.tag_bind(tag, '<ButtonPress-3>', self.__press_cb)
 
     ##//////////////////////////////////////////////////////
     ##  Inherited methods.
@@ -242,7 +244,7 @@ class CanvasWidget(metaclass=ABCMeta):
         if self.__hidden:
             return (0, 0, 0, 0)
         if len(self.tags()) == 0:
-            raise ValueError("No tags")
+            raise ValueError('No tags')
         return self.__canvas.bbox(*self.tags())
 
     def width(self):
@@ -252,7 +254,7 @@ class CanvasWidget(metaclass=ABCMeta):
         :rtype: int
         """
         if len(self.tags()) == 0:
-            raise ValueError("No tags")
+            raise ValueError('No tags')
         bbox = self.__canvas.bbox(*self.tags())
         return bbox[2] - bbox[0]
 
@@ -263,7 +265,7 @@ class CanvasWidget(metaclass=ABCMeta):
         :rtype: int
         """
         if len(self.tags()) == 0:
-            raise ValueError("No tags")
+            raise ValueError('No tags')
         bbox = self.__canvas.bbox(*self.tags())
         return bbox[3] - bbox[1]
 
@@ -314,7 +316,7 @@ class CanvasWidget(metaclass=ABCMeta):
         if self.__parent:
             self.__parent.update(self)
 
-    def moveto(self, x, y, anchor="NW"):
+    def moveto(self, x, y, anchor='NW'):
         """
         Move this canvas widget to the given location.  In particular,
         shift the canvas widget such that the corner or side of the
@@ -329,21 +331,21 @@ class CanvasWidget(metaclass=ABCMeta):
             corner; etc.
         """
         x1, y1, x2, y2 = self.bbox()
-        if anchor == "NW":
+        if anchor == 'NW':
             self.move(x - x1, y - y1)
-        if anchor == "N":
+        if anchor == 'N':
             self.move(x - x1 / 2 - x2 / 2, y - y1)
-        if anchor == "NE":
+        if anchor == 'NE':
             self.move(x - x2, y - y1)
-        if anchor == "E":
+        if anchor == 'E':
             self.move(x - x2, y - y1 / 2 - y2 / 2)
-        if anchor == "SE":
+        if anchor == 'SE':
             self.move(x - x2, y - y2)
-        if anchor == "S":
+        if anchor == 'S':
             self.move(x - x1 / 2 - x2 / 2, y - y2)
-        if anchor == "SW":
+        if anchor == 'SW':
             self.move(x - x1, y - y2)
-        if anchor == "W":
+        if anchor == 'W':
             self.move(x - x1, y - y1 / 2 - y2 / 2)
 
     def destroy(self):
@@ -365,9 +367,9 @@ class CanvasWidget(metaclass=ABCMeta):
             return
 
         for tag in self.tags():
-            self.__canvas.tag_unbind(tag, "<ButtonPress-1>")
-            self.__canvas.tag_unbind(tag, "<ButtonPress-2>")
-            self.__canvas.tag_unbind(tag, "<ButtonPress-3>")
+            self.__canvas.tag_unbind(tag, '<ButtonPress-1>')
+            self.__canvas.tag_unbind(tag, '<ButtonPress-2>')
+            self.__canvas.tag_unbind(tag, '<ButtonPress-3>')
         self.__canvas.delete(*self.tags())
         self.__canvas = None
 
@@ -418,7 +420,7 @@ class CanvasWidget(metaclass=ABCMeta):
         :rtype: list of int
         """
         if self.__canvas is None:
-            raise ValueError("Attempt to access a destroyed canvas widget")
+            raise ValueError('Attempt to access a destroyed canvas widget')
         tags = []
         tags += self._tags()
         for child in self.__children:
@@ -433,10 +435,10 @@ class CanvasWidget(metaclass=ABCMeta):
 
         :rtype: None
         """
-        if attr == "draggable":
+        if attr == 'draggable':
             self.__draggable = value
         else:
-            raise ValueError("Unknown attribute %r" % attr)
+            raise ValueError('Unknown attribute %r' % attr)
 
     def __getitem__(self, attr):
         """
@@ -445,17 +447,17 @@ class CanvasWidget(metaclass=ABCMeta):
             canvas widget.
         :rtype: (any)
         """
-        if attr == "draggable":
+        if attr == 'draggable':
             return self.__draggable
         else:
-            raise ValueError("Unknown attribute %r" % attr)
+            raise ValueError('Unknown attribute %r' % attr)
 
     def __repr__(self):
         """
         :return: a string representation of this canvas widget.
         :rtype: str
         """
-        return "<%s>" % self.__class__.__name__
+        return '<%s>' % self.__class__.__name__
 
     def hide(self):
         """
@@ -465,7 +467,7 @@ class CanvasWidget(metaclass=ABCMeta):
         """
         self.__hidden = 1
         for tag in self.tags():
-            self.__canvas.itemconfig(tag, state="hidden")
+            self.__canvas.itemconfig(tag, state='hidden')
 
     def show(self):
         """
@@ -475,7 +477,7 @@ class CanvasWidget(metaclass=ABCMeta):
         """
         self.__hidden = 0
         for tag in self.tags():
-            self.__canvas.itemconfig(tag, state="normal")
+            self.__canvas.itemconfig(tag, state='normal')
 
     def hidden(self):
         """
@@ -516,7 +518,7 @@ class CanvasWidget(metaclass=ABCMeta):
             will be called with this ``CanvasWidget`` as its argument.
         """
         self.__draggable = 1
-        self.__callbacks["drag"] = callback
+        self.__callbacks['drag'] = callback
 
     def unbind_click(self, button=1):
         """
@@ -537,7 +539,7 @@ class CanvasWidget(metaclass=ABCMeta):
         Remove a callback that was registered with ``bind_drag``.
         """
         try:
-            del self.__callbacks["drag"]
+            del self.__callbacks['drag']
         except:
             pass
 
@@ -556,14 +558,14 @@ class CanvasWidget(metaclass=ABCMeta):
         # If we're already waiting for a button release, then ignore
         # this new button press.
         if (
-            self.__canvas.bind("<ButtonRelease-1>")
-            or self.__canvas.bind("<ButtonRelease-2>")
-            or self.__canvas.bind("<ButtonRelease-3>")
+            self.__canvas.bind('<ButtonRelease-1>')
+            or self.__canvas.bind('<ButtonRelease-2>')
+            or self.__canvas.bind('<ButtonRelease-3>')
         ):
             return
 
         # Unbind motion (just in case; this shouldn't be necessary)
-        self.__canvas.unbind("<Motion>")
+        self.__canvas.unbind('<Motion>')
 
         # Record the button press event.
         self.__press = event
@@ -573,13 +575,13 @@ class CanvasWidget(metaclass=ABCMeta):
         if event.num == 1:
             widget = self
             while widget is not None:
-                if widget["draggable"]:
+                if widget['draggable']:
                     widget.__start_drag(event)
                     break
                 widget = widget.parent()
 
         # Set up the button release callback.
-        self.__canvas.bind("<ButtonRelease-%d>" % event.num, self.__release_cb)
+        self.__canvas.bind('<ButtonRelease-%d>' % event.num, self.__release_cb)
 
     def __start_drag(self, event):
         """
@@ -587,7 +589,7 @@ class CanvasWidget(metaclass=ABCMeta):
           - register a motion callback
           - record the drag coordinates
         """
-        self.__canvas.bind("<Motion>", self.__motion_cb)
+        self.__canvas.bind('<Motion>', self.__motion_cb)
         self.__drag_x = event.x
         self.__drag_y = event.y
 
@@ -609,8 +611,8 @@ class CanvasWidget(metaclass=ABCMeta):
           - call the appropriate handler.
         """
         # Unbind the button release & motion callbacks.
-        self.__canvas.unbind("<ButtonRelease-%d>" % event.num)
-        self.__canvas.unbind("<Motion>")
+        self.__canvas.unbind('<ButtonRelease-%d>' % event.num)
+        self.__canvas.unbind('<Motion>')
 
         # Is it a click or a drag?
         if (
@@ -635,12 +637,12 @@ class CanvasWidget(metaclass=ABCMeta):
         call it.  If no ancestors have a drag callback, do nothing.
         """
         if self.__draggable:
-            if "drag" in self.__callbacks:
-                cb = self.__callbacks["drag"]
+            if 'drag' in self.__callbacks:
+                cb = self.__callbacks['drag']
                 try:
                     cb(self)
                 except:
-                    print("Error in drag callback for %r" % self)
+                    print('Error in drag callback for %r' % self)
         elif self.__parent is not None:
             self.__parent.__drag()
 
@@ -655,7 +657,7 @@ class CanvasWidget(metaclass=ABCMeta):
             # try:
             cb(self)
             # except:
-            #    print('Error in click callback for %r' % self)
+            #    print 'Error in click callback for %r' % self
             #    raise
         elif self.__parent is not None:
             self.__parent.__click(button)
@@ -677,10 +679,10 @@ class CanvasWidget(metaclass=ABCMeta):
             have a parent.
         :type child: CanvasWidget
         """
-        if not hasattr(self, "_CanvasWidget__children"):
+        if not hasattr(self, '_CanvasWidget__children'):
             self.__children = []
         if child.__parent is not None:
-            raise ValueError("{} already has a parent".format(child))
+            raise ValueError('{} already has a parent'.format(child))
         child.__parent = self
         self.__children.append(child)
 
@@ -768,19 +770,19 @@ class TextWidget(CanvasWidget):
         CanvasWidget.__init__(self, canvas, **attribs)
 
     def __setitem__(self, attr, value):
-        if attr in ("color", "font", "justify", "width"):
-            if attr == "color":
-                attr = "fill"
+        if attr in ('color', 'font', 'justify', 'width'):
+            if attr == 'color':
+                attr = 'fill'
             self.canvas().itemconfig(self._tag, {attr: value})
         else:
             CanvasWidget.__setitem__(self, attr, value)
 
     def __getitem__(self, attr):
-        if attr == "width":
+        if attr == 'width':
             return int(self.canvas().itemcget(self._tag, attr))
-        elif attr in ("color", "font", "justify"):
-            if attr == "color":
-                attr = "fill"
+        elif attr in ('color', 'font', 'justify'):
+            if attr == 'color':
+                attr = 'fill'
             return self.canvas().itemcget(self._tag, attr)
         else:
             return CanvasWidget.__getitem__(self, attr)
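
The only subtlety this hunk preserves is the aliasing above: TextWidget exposes a 'color' attribute that is translated to the underlying canvas item's 'fill' option. A short illustrative sketch:

    from nltk.draw.util import CanvasFrame, TextWidget

    cf = CanvasFrame(width=200, height=80)
    tw = TextWidget(cf.canvas(), 'hello', font=('helvetica', -14))
    tw['color'] = '#aa0000'     # stored via itemconfig(..., fill='#aa0000')
    print(tw['color'])          # reads the Tk 'fill' value back
    cf.add_widget(tw, 10, 10)
    cf.mainloop()
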
@@ -793,7 +795,7 @@ class TextWidget(CanvasWidget):
         :return: The text displayed by this text widget.
         :rtype: str
         """
-        return self.canvas().itemcget(self._tag, "TEXT")
+        return self.canvas().itemcget(self._tag, 'TEXT')
 
     def set_text(self, text):
         """
@@ -808,7 +810,7 @@ class TextWidget(CanvasWidget):
             self.parent().update(self)
 
     def __repr__(self):
-        return "[Text: %r]" % self._text
+        return '[Text: %r]' % self._text
 
 
 class SymbolWidget(TextWidget):
@@ -830,24 +832,24 @@ class SymbolWidget(TextWidget):
     """
 
     SYMBOLS = {
-        "neg": "\330",
-        "disj": "\332",
-        "conj": "\331",
-        "lambda": "\154",
-        "merge": "\304",
-        "forall": "\042",
-        "exists": "\044",
-        "subseteq": "\315",
-        "subset": "\314",
-        "notsubset": "\313",
-        "emptyset": "\306",
-        "imp": "\336",
-        "rightarrow": chr(222),  #'\256',
-        "equal": "\75",
-        "notequal": "\271",
-        "intersection": "\307",
-        "union": "\310",
-        "epsilon": "e",
+        'neg': '\330',
+        'disj': '\332',
+        'conj': '\331',
+        'lambda': '\154',
+        'merge': '\304',
+        'forall': '\042',
+        'exists': '\044',
+        'subseteq': '\315',
+        'subset': '\314',
+        'notsubset': '\313',
+        'emptyset': '\306',
+        'imp': '\336',
+        'rightarrow': chr(222),  #'\256',
+        'equal': '\75',
+        'notequal': '\271',
+        'intersection': '\307',
+        'union': '\310',
+        'epsilon': 'e',
     }
 
     def __init__(self, canvas, symbol, **attribs):
@@ -860,8 +862,8 @@ class SymbolWidget(TextWidget):
         :param symbol: The name of the symbol to display.
         :param attribs: The new canvas widget's attributes.
         """
-        attribs["font"] = "symbol"
-        TextWidget.__init__(self, canvas, "", **attribs)
+        attribs['font'] = 'symbol'
+        TextWidget.__init__(self, canvas, '', **attribs)
         self.set_symbol(symbol)
 
     def symbol(self):
@@ -880,12 +882,12 @@ class SymbolWidget(TextWidget):
         :param symbol: The name of the symbol to display.
         """
         if symbol not in SymbolWidget.SYMBOLS:
-            raise ValueError("Unknown symbol: %s" % symbol)
+            raise ValueError('Unknown symbol: %s' % symbol)
         self._symbol = symbol
         self.set_text(SymbolWidget.SYMBOLS[symbol])
 
     def __repr__(self):
-        return "[Symbol: %r]" % self._symbol
+        return '[Symbol: %r]' % self._symbol
 
     @staticmethod
     def symbolsheet(size=20):
@@ -899,24 +901,24 @@ class SymbolWidget(TextWidget):
         def destroy(e, top=top):
             top.destroy()
 
-        top.bind("q", destroy)
-        Button(top, text="Quit", command=top.destroy).pack(side="bottom")
-        text = Text(top, font=("helvetica", -size), width=20, height=30)
-        text.pack(side="left")
+        top.bind('q', destroy)
+        Button(top, text='Quit', command=top.destroy).pack(side='bottom')
+        text = Text(top, font=('helvetica', -size), width=20, height=30)
+        text.pack(side='left')
         sb = Scrollbar(top, command=text.yview)
-        text["yscrollcommand"] = sb.set
-        sb.pack(side="right", fill="y")
-        text.tag_config("symbol", font=("symbol", -size))
+        text['yscrollcommand'] = sb.set
+        sb.pack(side='right', fill='y')
+        text.tag_config('symbol', font=('symbol', -size))
         for i in range(256):
             if i in (0, 10):
                 continue  # null and newline
             for k, v in list(SymbolWidget.SYMBOLS.items()):
                 if v == chr(i):
-                    text.insert("end", "%-10s\t" % k)
+                    text.insert('end', '%-10s\t' % k)
                     break
             else:
-                text.insert("end", "%-10d  \t" % i)
-            text.insert("end", "[%s]\n" % chr(i), "symbol")
+                text.insert('end', '%-10d  \t' % i)
+            text.insert('end', '[%s]\n' % chr(i), 'symbol')
         top.mainloop()
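
SYMBOLS maps friendly names to code points in the Adobe Symbol font, and the symbolsheet() helper above is the quickest way to browse them. A short illustrative sketch of direct use:

    from nltk.draw.util import CanvasFrame, SymbolWidget

    cf = CanvasFrame(width=200, height=100)
    cf.add_widget(SymbolWidget(cf.canvas(), 'forall'), 20, 30)   # universal quantifier
    cf.add_widget(SymbolWidget(cf.canvas(), 'exists'), 60, 30)   # existential quantifier
    # SymbolWidget.symbolsheet(size=24)  # reference window listing every symbol name
    cf.mainloop()
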
 
 
@@ -973,9 +975,9 @@ class AbstractContainerWidget(CanvasWidget):
 
     def __repr__(self):
         name = self.__class__.__name__
-        if name[-6:] == "Widget":
+        if name[-6:] == 'Widget':
             name = name[:-6]
-        return "[%s: %r]" % (name, self._child)
+        return '[%s: %r]' % (name, self._child)
 
 
 class BoxWidget(AbstractContainerWidget):
@@ -1009,26 +1011,26 @@ class BoxWidget(AbstractContainerWidget):
         AbstractContainerWidget.__init__(self, canvas, child, **attribs)
 
     def __setitem__(self, attr, value):
-        if attr == "margin":
+        if attr == 'margin':
             self._margin = value
-        elif attr in ("outline", "fill", "width"):
+        elif attr in ('outline', 'fill', 'width'):
             self.canvas().itemconfig(self._box, {attr: value})
         else:
             CanvasWidget.__setitem__(self, attr, value)
 
     def __getitem__(self, attr):
-        if attr == "margin":
+        if attr == 'margin':
             return self._margin
-        elif attr == "width":
+        elif attr == 'width':
             return float(self.canvas().itemcget(self._box, attr))
-        elif attr in ("outline", "fill", "width"):
+        elif attr in ('outline', 'fill', 'width'):
             return self.canvas().itemcget(self._box, attr)
         else:
             return CanvasWidget.__getitem__(self, attr)
 
     def _update(self, child):
         (x1, y1, x2, y2) = child.bbox()
-        margin = self._margin + self["width"] / 2
+        margin = self._margin + self['width'] / 2
         self.canvas().coords(
             self._box, x1 - margin, y1 - margin, x2 + margin, y2 + margin
         )
@@ -1065,8 +1067,8 @@ class OvalWidget(AbstractContainerWidget):
         self._child = child
         self._margin = 1
         self._oval = canvas.create_oval(1, 1, 1, 1)
-        self._circle = attribs.pop("circle", False)
-        self._double = attribs.pop("double", False)
+        self._circle = attribs.pop('circle', False)
+        self._double = attribs.pop('double', False)
         if self._double:
             self._oval2 = canvas.create_oval(1, 1, 1, 1)
         else:
@@ -1076,42 +1078,42 @@ class OvalWidget(AbstractContainerWidget):
 
     def __setitem__(self, attr, value):
         c = self.canvas()
-        if attr == "margin":
+        if attr == 'margin':
             self._margin = value
-        elif attr == "double":
+        elif attr == 'double':
             if value == True and self._oval2 is None:
                 # Copy attributes & position from self._oval.
                 x1, y1, x2, y2 = c.bbox(self._oval)
-                w = self["width"] * 2
+                w = self['width'] * 2
                 self._oval2 = c.create_oval(
                     x1 - w,
                     y1 - w,
                     x2 + w,
                     y2 + w,
-                    outline=c.itemcget(self._oval, "outline"),
-                    width=c.itemcget(self._oval, "width"),
+                    outline=c.itemcget(self._oval, 'outline'),
+                    width=c.itemcget(self._oval, 'width'),
                 )
                 c.tag_lower(self._oval2)
             if value == False and self._oval2 is not None:
                 c.delete(self._oval2)
                 self._oval2 = None
-        elif attr in ("outline", "fill", "width"):
+        elif attr in ('outline', 'fill', 'width'):
             c.itemconfig(self._oval, {attr: value})
-            if self._oval2 is not None and attr != "fill":
+            if self._oval2 is not None and attr != 'fill':
                 c.itemconfig(self._oval2, {attr: value})
-            if self._oval2 is not None and attr != "fill":
+            if self._oval2 is not None and attr != 'fill':
                 self.canvas().itemconfig(self._oval2, {attr: value})
         else:
             CanvasWidget.__setitem__(self, attr, value)
 
     def __getitem__(self, attr):
-        if attr == "margin":
+        if attr == 'margin':
             return self._margin
-        elif attr == "double":
+        elif attr == 'double':
             return self._double is not None
-        elif attr == "width":
+        elif attr == 'width':
             return float(self.canvas().itemcget(self._oval, attr))
-        elif attr in ("outline", "fill", "width"):
+        elif attr in ('outline', 'fill', 'width'):
             return self.canvas().itemcget(self._oval, attr)
         else:
             return CanvasWidget.__getitem__(self, attr)
@@ -1181,25 +1183,25 @@ class ParenWidget(AbstractContainerWidget):
         :param attribs: The new canvas widget's attributes.
         """
         self._child = child
-        self._oparen = canvas.create_arc(1, 1, 1, 1, style="arc", start=90, extent=180)
-        self._cparen = canvas.create_arc(1, 1, 1, 1, style="arc", start=-90, extent=180)
+        self._oparen = canvas.create_arc(1, 1, 1, 1, style='arc', start=90, extent=180)
+        self._cparen = canvas.create_arc(1, 1, 1, 1, style='arc', start=-90, extent=180)
         AbstractContainerWidget.__init__(self, canvas, child, **attribs)
 
     def __setitem__(self, attr, value):
-        if attr == "color":
+        if attr == 'color':
             self.canvas().itemconfig(self._oparen, outline=value)
             self.canvas().itemconfig(self._cparen, outline=value)
-        elif attr == "width":
+        elif attr == 'width':
             self.canvas().itemconfig(self._oparen, width=value)
             self.canvas().itemconfig(self._cparen, width=value)
         else:
             CanvasWidget.__setitem__(self, attr, value)
 
     def __getitem__(self, attr):
-        if attr == "color":
-            return self.canvas().itemcget(self._oparen, "outline")
-        elif attr == "width":
-            return self.canvas().itemcget(self._oparen, "width")
+        if attr == 'color':
+            return self.canvas().itemcget(self._oparen, 'outline')
+        elif attr == 'width':
+            return self.canvas().itemcget(self._oparen, 'width')
         else:
             return CanvasWidget.__getitem__(self, attr)
 
@@ -1241,20 +1243,20 @@ class BracketWidget(AbstractContainerWidget):
         AbstractContainerWidget.__init__(self, canvas, child, **attribs)
 
     def __setitem__(self, attr, value):
-        if attr == "color":
+        if attr == 'color':
             self.canvas().itemconfig(self._obrack, fill=value)
             self.canvas().itemconfig(self._cbrack, fill=value)
-        elif attr == "width":
+        elif attr == 'width':
             self.canvas().itemconfig(self._obrack, width=value)
             self.canvas().itemconfig(self._cbrack, width=value)
         else:
             CanvasWidget.__setitem__(self, attr, value)
 
     def __getitem__(self, attr):
-        if attr == "color":
-            return self.canvas().itemcget(self._obrack, "outline")
-        elif attr == "width":
-            return self.canvas().itemcget(self._obrack, "width")
+        if attr == 'color':
+            return self.canvas().itemcget(self._obrack, 'outline')
+        elif attr == 'width':
+            return self.canvas().itemcget(self._obrack, 'width')
         else:
             return CanvasWidget.__getitem__(self, attr)
 
@@ -1298,7 +1300,7 @@ class SequenceWidget(CanvasWidget):
         :type children: list(CanvasWidget)
         :param attribs: The new canvas widget's attributes.
         """
-        self._align = "center"
+        self._align = 'center'
         self._space = 1
         self._ordered = False
         self._children = list(children)
@@ -1307,23 +1309,23 @@ class SequenceWidget(CanvasWidget):
         CanvasWidget.__init__(self, canvas, **attribs)
 
     def __setitem__(self, attr, value):
-        if attr == "align":
-            if value not in ("top", "bottom", "center"):
-                raise ValueError("Bad alignment: %r" % value)
+        if attr == 'align':
+            if value not in ('top', 'bottom', 'center'):
+                raise ValueError('Bad alignment: %r' % value)
             self._align = value
-        elif attr == "space":
+        elif attr == 'space':
             self._space = value
-        elif attr == "ordered":
+        elif attr == 'ordered':
             self._ordered = value
         else:
             CanvasWidget.__setitem__(self, attr, value)
 
     def __getitem__(self, attr):
-        if attr == "align":
+        if attr == 'align':
             return self._align
-        elif attr == "space":
+        elif attr == 'space':
             return self._space
-        elif attr == "ordered":
+        elif attr == 'ordered':
             return self._ordered
         else:
             return CanvasWidget.__getitem__(self, attr)
@@ -1332,11 +1334,11 @@ class SequenceWidget(CanvasWidget):
         return []
 
     def _yalign(self, top, bot):
-        if self._align == "top":
+        if self._align == 'top':
             return top
-        if self._align == "bottom":
+        if self._align == 'bottom':
             return bot
-        if self._align == "center":
+        if self._align == 'center':
             return (top + bot) / 2
 
     def _update(self, child):
@@ -1390,7 +1392,7 @@ class SequenceWidget(CanvasWidget):
             x -= x2 - x1 + self._space
 
     def __repr__(self):
-        return "[Sequence: " + repr(self._children)[1:-1] + "]"
+        return '[Sequence: ' + repr(self._children)[1:-1] + ']'
 
     # Provide an alias for the child_widgets() member.
     children = CanvasWidget.child_widgets
@@ -1470,7 +1472,7 @@ class StackWidget(CanvasWidget):
         :type children: list(CanvasWidget)
         :param attribs: The new canvas widget's attributes.
         """
-        self._align = "center"
+        self._align = 'center'
         self._space = 1
         self._ordered = False
         self._children = list(children)
@@ -1479,23 +1481,23 @@ class StackWidget(CanvasWidget):
         CanvasWidget.__init__(self, canvas, **attribs)
 
     def __setitem__(self, attr, value):
-        if attr == "align":
-            if value not in ("left", "right", "center"):
-                raise ValueError("Bad alignment: %r" % value)
+        if attr == 'align':
+            if value not in ('left', 'right', 'center'):
+                raise ValueError('Bad alignment: %r' % value)
             self._align = value
-        elif attr == "space":
+        elif attr == 'space':
             self._space = value
-        elif attr == "ordered":
+        elif attr == 'ordered':
             self._ordered = value
         else:
             CanvasWidget.__setitem__(self, attr, value)
 
     def __getitem__(self, attr):
-        if attr == "align":
+        if attr == 'align':
             return self._align
-        elif attr == "space":
+        elif attr == 'space':
             return self._space
-        elif attr == "ordered":
+        elif attr == 'ordered':
             return self._ordered
         else:
             return CanvasWidget.__getitem__(self, attr)
@@ -1504,11 +1506,11 @@ class StackWidget(CanvasWidget):
         return []
 
     def _xalign(self, left, right):
-        if self._align == "left":
+        if self._align == 'left':
             return left
-        if self._align == "right":
+        if self._align == 'right':
             return right
-        if self._align == "center":
+        if self._align == 'center':
             return (left + right) / 2
 
     def _update(self, child):
@@ -1562,7 +1564,7 @@ class StackWidget(CanvasWidget):
             y -= y2 - y1 + self._space
 
     def __repr__(self):
-        return "[Stack: " + repr(self._children)[1:-1] + "]"
+        return '[Stack: ' + repr(self._children)[1:-1] + ']'
 
     # Provide an alias for the child_widgets() member.
     children = CanvasWidget.child_widgets
@@ -1643,7 +1645,7 @@ class SpaceWidget(CanvasWidget):
             width -= 4
         if height > 4:
             height -= 4
-        self._tag = canvas.create_line(1, 1, width, height, fill="")
+        self._tag = canvas.create_line(1, 1, width, height, fill='')
         CanvasWidget.__init__(self, canvas, **attribs)
 
     # note: width() and height() are already defined by CanvasWidget.
@@ -1673,7 +1675,7 @@ class SpaceWidget(CanvasWidget):
         return [self._tag]
 
     def __repr__(self):
-        return "[Space]"
+        return '[Space]'
 
 
 class ScrollWatcherWidget(CanvasWidget):
@@ -1739,7 +1741,7 @@ class ScrollWatcherWidget(CanvasWidget):
         """
         bbox = self.bbox()
         canvas = self.canvas()
-        scrollregion = [int(n) for n in canvas["scrollregion"].split()]
+        scrollregion = [int(n) for n in canvas['scrollregion'].split()]
         if len(scrollregion) != 4:
             return
         if (
@@ -1748,13 +1750,13 @@ class ScrollWatcherWidget(CanvasWidget):
             or bbox[2] > scrollregion[2]
             or bbox[3] > scrollregion[3]
         ):
-            scrollregion = "%d %d %d %d" % (
+            scrollregion = '%d %d %d %d' % (
                 min(bbox[0], scrollregion[0]),
                 min(bbox[1], scrollregion[1]),
                 max(bbox[2], scrollregion[2]),
                 max(bbox[3], scrollregion[3]),
             )
-            canvas["scrollregion"] = scrollregion
+            canvas['scrollregion'] = scrollregion
 
 
 ##//////////////////////////////////////////////////////
@@ -1791,35 +1793,35 @@ class CanvasFrame(object):
         # If no parent was given, set up a top-level window.
         if parent is None:
             self._parent = Tk()
-            self._parent.title("NLTK")
-            self._parent.bind("<Control-p>", lambda e: self.print_to_file())
-            self._parent.bind("<Control-x>", self.destroy)
-            self._parent.bind("<Control-q>", self.destroy)
+            self._parent.title('NLTK')
+            self._parent.bind('<Control-p>', lambda e: self.print_to_file())
+            self._parent.bind('<Control-x>', self.destroy)
+            self._parent.bind('<Control-q>', self.destroy)
         else:
             self._parent = parent
 
         # Create a frame for the canvas & scrollbars
         self._frame = frame = Frame(self._parent)
         self._canvas = canvas = Canvas(frame, **kw)
-        xscrollbar = Scrollbar(self._frame, orient="horizontal")
-        yscrollbar = Scrollbar(self._frame, orient="vertical")
-        xscrollbar["command"] = canvas.xview
-        yscrollbar["command"] = canvas.yview
-        canvas["xscrollcommand"] = xscrollbar.set
-        canvas["yscrollcommand"] = yscrollbar.set
-        yscrollbar.pack(fill="y", side="right")
-        xscrollbar.pack(fill="x", side="bottom")
-        canvas.pack(expand=1, fill="both", side="left")
+        xscrollbar = Scrollbar(self._frame, orient='horizontal')
+        yscrollbar = Scrollbar(self._frame, orient='vertical')
+        xscrollbar['command'] = canvas.xview
+        yscrollbar['command'] = canvas.yview
+        canvas['xscrollcommand'] = xscrollbar.set
+        canvas['yscrollcommand'] = yscrollbar.set
+        yscrollbar.pack(fill='y', side='right')
+        xscrollbar.pack(fill='x', side='bottom')
+        canvas.pack(expand=1, fill='both', side='left')
 
         # Set initial scroll region.
-        scrollregion = "0 0 %s %s" % (canvas["width"], canvas["height"])
-        canvas["scrollregion"] = scrollregion
+        scrollregion = '0 0 %s %s' % (canvas['width'], canvas['height'])
+        canvas['scrollregion'] = scrollregion
 
         self._scrollwatcher = ScrollWatcherWidget(canvas)
 
         # If no parent was given, pack the frame, and add a menu.
         if parent is None:
-            self.pack(expand=1, fill="both")
+            self.pack(expand=1, fill='both')
             self._init_menubar()
 
     def _init_menubar(self):
@@ -1827,15 +1829,15 @@ class CanvasFrame(object):
 
         filemenu = Menu(menubar, tearoff=0)
         filemenu.add_command(
-            label="Print to Postscript",
+            label='Print to Postscript',
             underline=0,
             command=self.print_to_file,
-            accelerator="Ctrl-p",
+            accelerator='Ctrl-p',
         )
         filemenu.add_command(
-            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+            label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
         )
-        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
 
         self._parent.config(menu=menubar)
 
@@ -1849,8 +1851,8 @@ class CanvasFrame(object):
         :rtype: None
         """
         if filename is None:
-            ftypes = [("Postscript files", ".ps"), ("All files", "*")]
-            filename = asksaveasfilename(filetypes=ftypes, defaultextension=".ps")
+            ftypes = [('Postscript files', '.ps'), ('All files', '*')]
+            filename = asksaveasfilename(filetypes=ftypes, defaultextension='.ps')
             if not filename:
                 return
         (x0, y0, w, h) = self.scrollregion()
@@ -1865,9 +1867,9 @@ class CanvasFrame(object):
             pagey=0,
         )
         # workaround for bug in Tk font handling
-        postscript = postscript.replace(" 0 scalefont ", " 9 scalefont ")
-        with open(filename, "wb") as f:
-            f.write(postscript.encode("utf8"))
+        postscript = postscript.replace(' 0 scalefont ', ' 9 scalefont ')
+        with open(filename, 'wb') as f:
+            f.write(postscript.encode('utf8'))
 
     def scrollregion(self):
         """
@@ -1875,7 +1877,7 @@ class CanvasFrame(object):
             this ``CanvasFrame``.
         :rtype: 4-tuple of int
         """
-        (x1, y1, x2, y2) = self._canvas["scrollregion"].split()
+        (x1, y1, x2, y2) = self._canvas['scrollregion'].split()
         return (int(x1), int(y1), int(x2), int(y2))
 
     def canvas(self):
@@ -2021,32 +2023,32 @@ class ShowText(object):
             self._top = top = Toplevel(root)
         top.title(title)
 
-        b = Button(top, text="Ok", command=self.destroy)
-        b.pack(side="bottom")
+        b = Button(top, text='Ok', command=self.destroy)
+        b.pack(side='bottom')
 
         tbf = Frame(top)
-        tbf.pack(expand=1, fill="both")
-        scrollbar = Scrollbar(tbf, orient="vertical")
-        scrollbar.pack(side="right", fill="y")
-        textbox = Text(tbf, wrap="word", width=width, height=height, **textbox_options)
-        textbox.insert("end", text)
-        textbox["state"] = "disabled"
-        textbox.pack(side="left", expand=1, fill="both")
-        scrollbar["command"] = textbox.yview
-        textbox["yscrollcommand"] = scrollbar.set
+        tbf.pack(expand=1, fill='both')
+        scrollbar = Scrollbar(tbf, orient='vertical')
+        scrollbar.pack(side='right', fill='y')
+        textbox = Text(tbf, wrap='word', width=width, height=height, **textbox_options)
+        textbox.insert('end', text)
+        textbox['state'] = 'disabled'
+        textbox.pack(side='left', expand=1, fill='both')
+        scrollbar['command'] = textbox.yview
+        textbox['yscrollcommand'] = scrollbar.set
 
         # Make it easy to close the window.
-        top.bind("q", self.destroy)
-        top.bind("x", self.destroy)
-        top.bind("c", self.destroy)
-        top.bind("<Return>", self.destroy)
-        top.bind("<Escape>", self.destroy)
+        top.bind('q', self.destroy)
+        top.bind('x', self.destroy)
+        top.bind('c', self.destroy)
+        top.bind('<Return>', self.destroy)
+        top.bind('<Escape>', self.destroy)
 
         # Focus the scrollbar, so they can use up/down, etc.
         scrollbar.focus()
 
     def find_dimentions(self, text, width, height):
-        lines = text.split("\n")
+        lines = text.split('\n')
         if width is None:
             maxwidth = max(len(line) for line in lines)
             width = min(maxwidth, 80)
@@ -2055,7 +2057,7 @@ class ShowText(object):
         height = 0
         for line in lines:
             while len(line) > width:
-                brk = line[:width].rfind(" ")
+                brk = line[:width].rfind(' ')
                 line = line[brk:]
                 height += 1
             height += 1
@@ -2092,7 +2094,7 @@ class EntryDialog(object):
     """
 
     def __init__(
-        self, parent, original_text="", instructions="", set_callback=None, title=None
+        self, parent, original_text='', instructions='', set_callback=None, title=None
     ):
         self._parent = parent
         self._original_text = original_text
@@ -2106,36 +2108,36 @@ class EntryDialog(object):
 
         # The text entry box.
         entryframe = Frame(self._top)
-        entryframe.pack(expand=1, fill="both", padx=5, pady=5, ipady=10)
+        entryframe.pack(expand=1, fill='both', padx=5, pady=5, ipady=10)
         if instructions:
             l = Label(entryframe, text=instructions)
-            l.pack(side="top", anchor="w", padx=30)
+            l.pack(side='top', anchor='w', padx=30)
         self._entry = Entry(entryframe, width=width)
-        self._entry.pack(expand=1, fill="x", padx=30)
+        self._entry.pack(expand=1, fill='x', padx=30)
         self._entry.insert(0, original_text)
 
         # A divider
-        divider = Frame(self._top, borderwidth=1, relief="sunken")
-        divider.pack(fill="x", ipady=1, padx=10)
+        divider = Frame(self._top, borderwidth=1, relief='sunken')
+        divider.pack(fill='x', ipady=1, padx=10)
 
         # The buttons.
         buttons = Frame(self._top)
-        buttons.pack(expand=0, fill="x", padx=5, pady=5)
-        b = Button(buttons, text="Cancel", command=self._cancel, width=8)
-        b.pack(side="right", padx=5)
-        b = Button(buttons, text="Ok", command=self._ok, width=8, default="active")
-        b.pack(side="left", padx=5)
-        b = Button(buttons, text="Apply", command=self._apply, width=8)
-        b.pack(side="left")
-
-        self._top.bind("<Return>", self._ok)
-        self._top.bind("<Control-q>", self._cancel)
-        self._top.bind("<Escape>", self._cancel)
+        buttons.pack(expand=0, fill='x', padx=5, pady=5)
+        b = Button(buttons, text='Cancel', command=self._cancel, width=8)
+        b.pack(side='right', padx=5)
+        b = Button(buttons, text='Ok', command=self._ok, width=8, default='active')
+        b.pack(side='left', padx=5)
+        b = Button(buttons, text='Apply', command=self._apply, width=8)
+        b.pack(side='left')
+
+        self._top.bind('<Return>', self._ok)
+        self._top.bind('<Control-q>', self._cancel)
+        self._top.bind('<Escape>', self._cancel)
 
         self._entry.focus()
 
     def _reset(self, *e):
-        self._entry.delete(0, "end")
+        self._entry.delete(0, 'end')
         self._entry.insert(0, self._original_text)
         if self._set_callback:
             self._set_callback(self._original_text)
@@ -2198,8 +2200,8 @@ class ColorizedList(object):
         self._init_itemframe(options.copy())
 
         # Set up key & mouse bindings.
-        self._textwidget.bind("<KeyPress>", self._keypress)
-        self._textwidget.bind("<ButtonPress>", self._buttonpress)
+        self._textwidget.bind('<KeyPress>', self._keypress)
+        self._textwidget.bind('<ButtonPress>', self._buttonpress)
 
         # Fill in the given CFG's items.
         self._items = None
@@ -2247,17 +2249,17 @@ class ColorizedList(object):
             return
         self._items = list(items)
 
-        self._textwidget["state"] = "normal"
-        self._textwidget.delete("1.0", "end")
+        self._textwidget['state'] = 'normal'
+        self._textwidget.delete('1.0', 'end')
         for item in items:
             for (text, colortag) in self._item_repr(item):
-                assert "\n" not in text, "item repr may not contain newline"
-                self._textwidget.insert("end", text, colortag)
-            self._textwidget.insert("end", "\n")
+                assert '\n' not in text, 'item repr may not contain newline'
+                self._textwidget.insert('end', text, colortag)
+            self._textwidget.insert('end', '\n')
         # Remove the final newline
-        self._textwidget.delete("end-1char", "end")
-        self._textwidget.mark_set("insert", "1.0")
-        self._textwidget["state"] = "disabled"
+        self._textwidget.delete('end-1char', 'end')
+        self._textwidget.mark_set('insert', '1.0')
+        self._textwidget['state'] = 'disabled'
         # Clear all marks
         self._marks.clear()
 
@@ -2270,12 +2272,12 @@ class ColorizedList(object):
         """
         if item is None:
             self._marks.clear()
-            self._textwidget.tag_remove("highlight", "1.0", "end+1char")
+            self._textwidget.tag_remove('highlight', '1.0', 'end+1char')
         else:
             index = self._items.index(item)
             del self._marks[item]
-            (start, end) = ("%d.0" % (index + 1), "%d.0" % (index + 2))
-            self._textwidget.tag_remove("highlight", start, end)
+            (start, end) = ('%d.0' % (index + 1), '%d.0' % (index + 2))
+            self._textwidget.tag_remove('highlight', start, end)
 
     def mark(self, item):
         """
@@ -2284,8 +2286,8 @@ class ColorizedList(object):
         """
         self._marks[item] = 1
         index = self._items.index(item)
-        (start, end) = ("%d.0" % (index + 1), "%d.0" % (index + 2))
-        self._textwidget.tag_add("highlight", start, end)
+        (start, end) = ('%d.0' % (index + 1), '%d.0' % (index + 2))
+        self._textwidget.tag_add('highlight', start, end)
 
     def markonly(self, item):
         """
@@ -2301,7 +2303,7 @@ class ColorizedList(object):
         the item is already visible, then do nothing.
         """
         index = self._items.index(item)
-        self._textwidget.see("%d.0" % (index + 1))
+        self._textwidget.see('%d.0' % (index + 1))
 
     # ////////////////////////////////////////////////////////////
     # Callbacks
@@ -2320,10 +2322,10 @@ class ColorizedList(object):
             single item as its argument.  (The item selected
             or the item moved to).
         """
-        if event == "select":
-            events = ["click1", "space", "return"]
-        elif event == "move":
-            events = ["up", "down", "next", "prior"]
+        if event == 'select':
+            events = ['click1', 'space', 'return']
+        elif event == 'move':
+            events = ['up', 'down', 'next', 'prior']
         else:
             events = [event]
 
@@ -2337,10 +2339,10 @@ class ColorizedList(object):
         """
         if event is None:
             events = list(self._callbacks.keys())
-        elif event == "select":
-            events = ["click1", "space", "return"]
-        elif event == "move":
-            events = ["up", "down", "next", "prior"]
+        elif event == 'select':
+            events = ['click1', 'space', 'return']
+        elif event == 'move':
+            events = ['up', 'down', 'next', 'prior']
         else:
             events = [event]
 
@@ -2377,26 +2379,26 @@ class ColorizedList(object):
         self._itemframe = Frame(self._parent)
 
         # Create the basic Text widget & scrollbar.
-        options.setdefault("background", "#e0e0e0")
+        options.setdefault('background', '#e0e0e0')
         self._textwidget = Text(self._itemframe, **options)
-        self._textscroll = Scrollbar(self._itemframe, takefocus=0, orient="vertical")
+        self._textscroll = Scrollbar(self._itemframe, takefocus=0, orient='vertical')
         self._textwidget.config(yscrollcommand=self._textscroll.set)
         self._textscroll.config(command=self._textwidget.yview)
-        self._textscroll.pack(side="right", fill="y")
-        self._textwidget.pack(expand=1, fill="both", side="left")
+        self._textscroll.pack(side='right', fill='y')
+        self._textwidget.pack(expand=1, fill='both', side='left')
 
         # Initialize the colorization tags
         self._textwidget.tag_config(
-            "highlight", background="#e0ffff", border="1", relief="raised"
+            'highlight', background='#e0ffff', border='1', relief='raised'
         )
         self._init_colortags(self._textwidget, options)
 
         # How do I want to mark keyboard selection?
-        self._textwidget.tag_config("sel", foreground="")
+        self._textwidget.tag_config('sel', foreground='')
         self._textwidget.tag_config(
-            "sel", foreground="", background="", border="", underline=1
+            'sel', foreground='', background='', border='', underline=1
         )
-        self._textwidget.tag_lower("highlight", "sel")
+        self._textwidget.tag_lower('highlight', 'sel')
 
     def _fire_callback(self, event, itemnum):
         if event not in self._callbacks:
@@ -2409,38 +2411,38 @@ class ColorizedList(object):
             cb_func(item)
 
     def _buttonpress(self, event):
-        clickloc = "@%d,%d" % (event.x, event.y)
+        clickloc = '@%d,%d' % (event.x, event.y)
         insert_point = self._textwidget.index(clickloc)
-        itemnum = int(insert_point.split(".")[0]) - 1
-        self._fire_callback("click%d" % event.num, itemnum)
+        itemnum = int(insert_point.split('.')[0]) - 1
+        self._fire_callback('click%d' % event.num, itemnum)
 
     def _keypress(self, event):
-        if event.keysym == "Return" or event.keysym == "space":
-            insert_point = self._textwidget.index("insert")
-            itemnum = int(insert_point.split(".")[0]) - 1
+        if event.keysym == 'Return' or event.keysym == 'space':
+            insert_point = self._textwidget.index('insert')
+            itemnum = int(insert_point.split('.')[0]) - 1
             self._fire_callback(event.keysym.lower(), itemnum)
             return
-        elif event.keysym == "Down":
-            delta = "+1line"
-        elif event.keysym == "Up":
-            delta = "-1line"
-        elif event.keysym == "Next":
-            delta = "+10lines"
-        elif event.keysym == "Prior":
-            delta = "-10lines"
+        elif event.keysym == 'Down':
+            delta = '+1line'
+        elif event.keysym == 'Up':
+            delta = '-1line'
+        elif event.keysym == 'Next':
+            delta = '+10lines'
+        elif event.keysym == 'Prior':
+            delta = '-10lines'
         else:
-            return "continue"
+            return 'continue'
 
-        self._textwidget.mark_set("insert", "insert" + delta)
-        self._textwidget.see("insert")
-        self._textwidget.tag_remove("sel", "1.0", "end+1char")
-        self._textwidget.tag_add("sel", "insert linestart", "insert lineend")
+        self._textwidget.mark_set('insert', 'insert' + delta)
+        self._textwidget.see('insert')
+        self._textwidget.tag_remove('sel', '1.0', 'end+1char')
+        self._textwidget.tag_add('sel', 'insert linestart', 'insert lineend')
 
-        insert_point = self._textwidget.index("insert")
-        itemnum = int(insert_point.split(".")[0]) - 1
+        insert_point = self._textwidget.index('insert')
+        itemnum = int(insert_point.split('.')[0]) - 1
         self._fire_callback(event.keysym.lower(), itemnum)
 
-        return "break"
+        return 'break'
 
 
 ##//////////////////////////////////////////////////////
@@ -2450,9 +2452,9 @@ class ColorizedList(object):
 
 class MutableOptionMenu(Menubutton):
     def __init__(self, master, values, **options):
-        self._callback = options.get("command")
-        if "command" in options:
-            del options["command"]
+        self._callback = options.get('command')
+        if 'command' in options:
+            del options['command']
 
         # Create a variable
         self._variable = variable = StringVar()
@@ -2469,7 +2471,7 @@ class MutableOptionMenu(Menubutton):
         }
         kw.update(options)
         Widget.__init__(self, master, "menubutton", kw)
-        self.widgetName = "tk_optionMenu"
+        self.widgetName = 'tk_optionMenu'
         self._menu = Menu(self, name="menu", tearoff=0)
         self.menuname = self._menu._w
 
@@ -2501,7 +2503,7 @@ class MutableOptionMenu(Menubutton):
         self._menu.delete(i, i)
 
     def __getitem__(self, name):
-        if name == "menu":
+        if name == 'menu':
             return self.__menu
         return Widget.__getitem__(self, name)
 
@@ -2524,38 +2526,38 @@ def demo():
     def fill(cw):
         from random import randint
 
-        cw["fill"] = "#00%04d" % randint(0, 9999)
+        cw['fill'] = '#00%04d' % randint(0, 9999)
 
     def color(cw):
         from random import randint
 
-        cw["color"] = "#ff%04d" % randint(0, 9999)
+        cw['color'] = '#ff%04d' % randint(0, 9999)
 
     cf = CanvasFrame(closeenough=10, width=300, height=300)
     c = cf.canvas()
-    ct3 = TextWidget(c, "hiya there", draggable=1)
-    ct2 = TextWidget(c, "o  o\n||\n___\n  U", draggable=1, justify="center")
-    co = OvalWidget(c, ct2, outline="red")
-    ct = TextWidget(c, "o  o\n||\n\\___/", draggable=1, justify="center")
-    cp = ParenWidget(c, ct, color="red")
-    cb = BoxWidget(c, cp, fill="cyan", draggable=1, width=3, margin=10)
+    ct3 = TextWidget(c, 'hiya there', draggable=1)
+    ct2 = TextWidget(c, 'o  o\n||\n___\n  U', draggable=1, justify='center')
+    co = OvalWidget(c, ct2, outline='red')
+    ct = TextWidget(c, 'o  o\n||\n\\___/', draggable=1, justify='center')
+    cp = ParenWidget(c, ct, color='red')
+    cb = BoxWidget(c, cp, fill='cyan', draggable=1, width=3, margin=10)
     equation = SequenceWidget(
         c,
-        SymbolWidget(c, "forall"),
-        TextWidget(c, "x"),
-        SymbolWidget(c, "exists"),
-        TextWidget(c, "y: "),
-        TextWidget(c, "x"),
-        SymbolWidget(c, "notequal"),
-        TextWidget(c, "y"),
+        SymbolWidget(c, 'forall'),
+        TextWidget(c, 'x'),
+        SymbolWidget(c, 'exists'),
+        TextWidget(c, 'y: '),
+        TextWidget(c, 'x'),
+        SymbolWidget(c, 'notequal'),
+        TextWidget(c, 'y'),
     )
     space = SpaceWidget(c, 0, 30)
-    cstack = StackWidget(c, cb, ct3, space, co, equation, align="center")
+    cstack = StackWidget(c, cb, ct3, space, co, equation, align='center')
     prompt_msg = TextWidget(
-        c, "try clicking\nand dragging", draggable=1, justify="center"
+        c, 'try clicking\nand dragging', draggable=1, justify='center'
     )
     cs = SequenceWidget(c, cstack, prompt_msg)
-    zz = BracketWidget(c, cs, color="green4", width=3)
+    zz = BracketWidget(c, cs, color='green4', width=3)
     cf.add_widget(zz, 60, 30)
 
     cb.bind_click(fill)
@@ -2568,5 +2570,5 @@ def demo():
     # ShowText(None, 'title', ((('this is text'*150)+'\n')*5))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
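
For orientation, the CanvasFrame, TextWidget and BoxWidget classes touched in the hunks above are typically wired together as in the following minimal sketch (this assumes a working Tk display; the widget contents and the 'out.ps' filename are purely illustrative):

    >>> from nltk.draw.util import CanvasFrame, TextWidget, BoxWidget
    >>> cf = CanvasFrame(width=300, height=200)         # opens a top-level Tk window titled 'NLTK'
    >>> label = TextWidget(cf.canvas(), 'hello', draggable=1)
    >>> box = BoxWidget(cf.canvas(), label, margin=10)   # draw a box around the text widget
    >>> cf.add_widget(box, 20, 20)                       # place the box at canvas position (20, 20)
    >>> cf.print_to_file('out.ps')                       # dump the canvas as Postscript, as the File menu does
    >>> cf.destroy()
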
diff --git a/nlp_resource_data/nltk/featstruct.py b/nlp_resource_data/nltk/featstruct.py
index f90b581..9c2cdeb 100644
--- a/nlp_resource_data/nltk/featstruct.py
+++ b/nlp_resource_data/nltk/featstruct.py
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Feature Structures
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>,
 #         Rob Speer,
 #         Steven Bird <stevenbird1@gmail.com>
@@ -88,11 +88,14 @@ In general, if your feature structures will contain any reentrances,
 or if you plan to use them as dictionary keys, it is strongly
 recommended that you use full-fledged ``FeatStruct`` objects.
 """
+from __future__ import print_function, unicode_literals, division
 
 import re
 import copy
 from functools import total_ordering
 
+from six import integer_types, string_types
+
 from nltk.internals import read_str, raise_unorderable_types
 from nltk.sem.logic import (
     Variable,
@@ -101,6 +104,8 @@ from nltk.sem.logic import (
     LogicParser,
     LogicalExpressionException,
 )
+from nltk.compat import python_2_unicode_compatible, unicode_repr
+
 
 ######################################################################
 # Feature Structure
@@ -178,10 +183,10 @@ class FeatStruct(SubstituteBindingsI):
                 return FeatDict.__new__(FeatDict, features, **morefeatures)
             elif morefeatures:
                 raise TypeError(
-                    "Keyword arguments may only be specified "
-                    "if features is None or is a mapping."
+                    'Keyword arguments may only be specified '
+                    'if features is None or is a mapping.'
                 )
-            if isinstance(features, str):
+            if isinstance(features, string_types):
                 if FeatStructReader._START_FDICT_RE.match(features):
                     return FeatDict.__new__(FeatDict, features, **morefeatures)
                 else:
@@ -189,7 +194,7 @@ class FeatStruct(SubstituteBindingsI):
             elif _is_sequence(features):
                 return FeatList.__new__(FeatList, features)
             else:
-                raise TypeError("Expected string or mapping or sequence")
+                raise TypeError('Expected string or mapping or sequence')
 
         # Otherwise, construct the object as normal.
         else:
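
In other words, the constructor shown above is a dispatcher: depending on its argument it hands back a FeatDict or a FeatList. A small sketch of the three cases (the feature names are arbitrary):

    >>> from nltk.featstruct import FeatStruct, FeatDict, FeatList
    >>> isinstance(FeatStruct(pos='N', num='sg'), FeatDict)            # keyword arguments -> FeatDict
    True
    >>> isinstance(FeatStruct("[pos='N', agr=[num='sg']]"), FeatDict)  # dict-like string -> parsed FeatDict
    True
    >>> isinstance(FeatStruct(['a', 'b']), FeatList)                   # plain sequence -> FeatList
    True
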
@@ -267,7 +272,7 @@ class FeatStruct(SubstituteBindingsI):
         otherwise, raise ``TypeError``.
         """
         if not self._frozen:
-            raise TypeError("FeatStructs must be frozen before they " "can be hashed.")
+            raise TypeError('FeatStructs must be frozen before they ' 'can be hashed.')
         try:
             return self._hash
         except AttributeError:
@@ -571,7 +576,7 @@ _FROZEN_ERROR = "Frozen FeatStructs may not be modified."
 _FROZEN_NOTICE = "\n%sIf self is frozen, raise ValueError."
 
 
-def _check_frozen(method, indent=""):
+def _check_frozen(method, indent=''):
     """
     Given a method function, return a new method function that first
     checks if ``self._frozen`` is true; and if so, raises ``ValueError``
@@ -586,7 +591,7 @@ def _check_frozen(method, indent=""):
             return method(self, *args, **kwargs)
 
     wrapped.__name__ = method.__name__
-    wrapped.__doc__ = (method.__doc__ or "") + (_FROZEN_NOTICE % indent)
+    wrapped.__doc__ = (method.__doc__ or '') + (_FROZEN_NOTICE % indent)
     return wrapped
 
 
@@ -595,7 +600,7 @@ def _check_frozen(method, indent=""):
 ######################################################################
 
 
-
+@python_2_unicode_compatible
 class FeatDict(FeatStruct, dict):
     """
     A feature structure that acts like a Python dictionary.  I.e., a
@@ -628,7 +633,7 @@ class FeatDict(FeatStruct, dict):
             ``morefeatures``, then the value from ``morefeatures`` will be
             used.
         """
-        if isinstance(features, str):
+        if isinstance(features, string_types):
             FeatStructReader().fromstring(features, self)
             self.update(**morefeatures)
         else:
@@ -643,7 +648,7 @@ class FeatDict(FeatStruct, dict):
     def __getitem__(self, name_or_path):
         """If the feature with the given name or path exists, return
         its value; otherwise, raise ``KeyError``."""
-        if isinstance(name_or_path, (str, Feature)):
+        if isinstance(name_or_path, (string_types, Feature)):
             return dict.__getitem__(self, name_or_path)
         elif isinstance(name_or_path, tuple):
             try:
@@ -683,7 +688,7 @@ class FeatDict(FeatStruct, dict):
         its value; otherwise, raise ``KeyError``."""
         if self._frozen:
             raise ValueError(_FROZEN_ERROR)
-        if isinstance(name_or_path, (str, Feature)):
+        if isinstance(name_or_path, (string_types, Feature)):
             return dict.__delitem__(self, name_or_path)
         elif isinstance(name_or_path, tuple):
             if len(name_or_path) == 0:
@@ -702,7 +707,7 @@ class FeatDict(FeatStruct, dict):
         ``KeyError``."""
         if self._frozen:
             raise ValueError(_FROZEN_ERROR)
-        if isinstance(name_or_path, (str, Feature)):
+        if isinstance(name_or_path, (string_types, Feature)):
             return dict.__setitem__(self, name_or_path, value)
         elif isinstance(name_or_path, tuple):
             if len(name_or_path) == 0:
@@ -725,20 +730,20 @@ class FeatDict(FeatStruct, dict):
             raise ValueError(_FROZEN_ERROR)
         if features is None:
             items = ()
-        elif hasattr(features, "items") and callable(features.items):
+        elif hasattr(features, 'items') and callable(features.items):
             items = features.items()
-        elif hasattr(features, "__iter__"):
+        elif hasattr(features, '__iter__'):
             items = features
         else:
-            raise ValueError("Expected mapping or list of tuples")
+            raise ValueError('Expected mapping or list of tuples')
 
         for key, val in items:
-            if not isinstance(key, (str, Feature)):
-                raise TypeError("Feature names must be strings")
+            if not isinstance(key, (string_types, Feature)):
+                raise TypeError('Feature names must be strings')
             self[key] = val
         for key, val in morefeatures.items():
-            if not isinstance(key, (str, Feature)):
-                raise TypeError("Feature names must be strings")
+            if not isinstance(key, (string_types, Feature)):
+                raise TypeError('Feature names must be strings')
             self[key] = val
 
     ##////////////////////////////////////////////////////////////
@@ -773,12 +778,12 @@ class FeatDict(FeatStruct, dict):
         Display a multi-line representation of this feature dictionary
         as an FVM (feature value matrix).
         """
-        return "\n".join(self._str(self._find_reentrances({}), {}))
+        return '\n'.join(self._str(self._find_reentrances({}), {}))
 
     def _repr(self, reentrances, reentrance_ids):
         segments = []
-        prefix = ""
-        suffix = ""
+        prefix = ''
+        suffix = ''
 
         # If this is the first time we've seen a reentrant structure,
         # then assign it a unique identifier.
@@ -789,37 +794,37 @@ class FeatDict(FeatStruct, dict):
         # sorting note: keys are unique strings, so we'll never fall
         # through to comparing values.
         for (fname, fval) in sorted(self.items()):
-            display = getattr(fname, "display", None)
+            display = getattr(fname, 'display', None)
             if id(fval) in reentrance_ids:
-                segments.append("%s->(%s)" % (fname, reentrance_ids[id(fval)]))
+                segments.append('%s->(%s)' % (fname, reentrance_ids[id(fval)]))
             elif (
-                display == "prefix"
+                display == 'prefix'
                 and not prefix
-                and isinstance(fval, (Variable, str))
+                and isinstance(fval, (Variable, string_types))
             ):
-                prefix = "%s" % fval
-            elif display == "slash" and not suffix:
+                prefix = '%s' % fval
+            elif display == 'slash' and not suffix:
                 if isinstance(fval, Variable):
-                    suffix = "/%s" % fval.name
+                    suffix = '/%s' % fval.name
                 else:
-                    suffix = "/%s" % repr(fval)
+                    suffix = '/%s' % unicode_repr(fval)
             elif isinstance(fval, Variable):
-                segments.append("%s=%s" % (fname, fval.name))
+                segments.append('%s=%s' % (fname, fval.name))
             elif fval is True:
-                segments.append("+%s" % fname)
+                segments.append('+%s' % fname)
             elif fval is False:
-                segments.append("-%s" % fname)
+                segments.append('-%s' % fname)
             elif isinstance(fval, Expression):
-                segments.append("%s=<%s>" % (fname, fval))
+                segments.append('%s=<%s>' % (fname, fval))
             elif not isinstance(fval, FeatStruct):
-                segments.append("%s=%s" % (fname, repr(fval)))
+                segments.append('%s=%s' % (fname, unicode_repr(fval)))
             else:
                 fval_repr = fval._repr(reentrances, reentrance_ids)
-                segments.append("%s=%s" % (fname, fval_repr))
+                segments.append('%s=%s' % (fname, fval_repr))
         # If it's reentrant, then add on an identifier tag.
         if reentrances[id(self)]:
-            prefix = "(%s)%s" % (reentrance_ids[id(self)], prefix)
-        return "%s[%s]%s" % (prefix, ", ".join(segments), suffix)
+            prefix = '(%s)%s' % (reentrance_ids[id(self)], prefix)
+        return '%s[%s]%s' % (prefix, ', '.join(segments), suffix)
 
     def _str(self, reentrances, reentrance_ids):
         """
@@ -843,9 +848,9 @@ class FeatDict(FeatStruct, dict):
         # Special case: empty feature dict.
         if len(self) == 0:
             if reentrances[id(self)]:
-                return ["(%s) []" % reentrance_ids[id(self)]]
+                return ['(%s) []' % reentrance_ids[id(self)]]
             else:
-                return ["[]"]
+                return ['[]']
 
         # What's the longest feature name?  Use this to align names.
         maxfnamelen = max(len("%s" % k) for k in self.keys())
@@ -856,60 +861,60 @@ class FeatDict(FeatStruct, dict):
         for (fname, fval) in sorted(self.items()):
             fname = ("%s" % fname).ljust(maxfnamelen)
             if isinstance(fval, Variable):
-                lines.append("%s = %s" % (fname, fval.name))
+                lines.append('%s = %s' % (fname, fval.name))
 
             elif isinstance(fval, Expression):
-                lines.append("%s = <%s>" % (fname, fval))
+                lines.append('%s = <%s>' % (fname, fval))
 
             elif isinstance(fval, FeatList):
                 fval_repr = fval._repr(reentrances, reentrance_ids)
-                lines.append("%s = %s" % (fname, repr(fval_repr)))
+                lines.append('%s = %s' % (fname, unicode_repr(fval_repr)))
 
             elif not isinstance(fval, FeatDict):
                 # It's not a nested feature structure -- just print it.
-                lines.append("%s = %s" % (fname, repr(fval)))
+                lines.append('%s = %s' % (fname, unicode_repr(fval)))
 
             elif id(fval) in reentrance_ids:
                 # It's a feature structure we've seen before -- print
                 # the reentrance id.
-                lines.append("%s -> (%s)" % (fname, reentrance_ids[id(fval)]))
+                lines.append('%s -> (%s)' % (fname, reentrance_ids[id(fval)]))
 
             else:
                 # It's a new feature structure.  Separate it from
                 # other values by a blank line.
-                if lines and lines[-1] != "":
-                    lines.append("")
+                if lines and lines[-1] != '':
+                    lines.append('')
 
                 # Recursively print the feature's value (fval).
                 fval_lines = fval._str(reentrances, reentrance_ids)
 
                 # Indent each line to make room for fname.
-                fval_lines = [(" " * (maxfnamelen + 3)) + l for l in fval_lines]
+                fval_lines = [(' ' * (maxfnamelen + 3)) + l for l in fval_lines]
 
                 # Pick which line we'll display fname on, & splice it in.
                 nameline = (len(fval_lines) - 1) // 2
                 fval_lines[nameline] = (
-                    fname + " =" + fval_lines[nameline][maxfnamelen + 2 :]
+                    fname + ' =' + fval_lines[nameline][maxfnamelen + 2 :]
                 )
 
                 # Add the feature structure to the output.
                 lines += fval_lines
 
                 # Separate FeatStructs by a blank line.
-                lines.append("")
+                lines.append('')
 
         # Get rid of any excess blank lines.
-        if lines[-1] == "":
+        if lines[-1] == '':
             lines.pop()
 
         # Add brackets around everything.
         maxlen = max(len(line) for line in lines)
-        lines = ["[ %s%s ]" % (line, " " * (maxlen - len(line))) for line in lines]
+        lines = ['[ %s%s ]' % (line, ' ' * (maxlen - len(line))) for line in lines]
 
         # If it's reentrant, then add on an identifier tag.
         if reentrances[id(self)]:
-            idstr = "(%s) " % reentrance_ids[id(self)]
-            lines = [(" " * len(idstr)) + l for l in lines]
+            idstr = '(%s) ' % reentrance_ids[id(self)]
+            lines = [(' ' * len(idstr)) + l for l in lines]
             idline = (len(lines) - 1) // 2
             lines[idline] = idstr + lines[idline][len(idstr) :]
 
@@ -947,7 +952,7 @@ class FeatList(FeatStruct, list):
             ``FeatStructReader``.  Otherwise, it should be a sequence
             of basic values and nested feature structures.
         """
-        if isinstance(features, str):
+        if isinstance(features, string_types):
             FeatStructReader().fromstring(features, self)
         else:
             list.__init__(self, features)
@@ -958,7 +963,7 @@ class FeatList(FeatStruct, list):
     _INDEX_ERROR = "Expected int or feature path.  Got %r."
 
     def __getitem__(self, name_or_path):
-        if isinstance(name_or_path, int):
+        if isinstance(name_or_path, integer_types):
             return list.__getitem__(self, name_or_path)
         elif isinstance(name_or_path, tuple):
             try:
@@ -978,7 +983,7 @@ class FeatList(FeatStruct, list):
         its value; otherwise, raise ``KeyError``."""
         if self._frozen:
             raise ValueError(_FROZEN_ERROR)
-        if isinstance(name_or_path, (int, slice)):
+        if isinstance(name_or_path, (integer_types, slice)):
             return list.__delitem__(self, name_or_path)
         elif isinstance(name_or_path, tuple):
             if len(name_or_path) == 0:
@@ -997,7 +1002,7 @@ class FeatList(FeatStruct, list):
         ``KeyError``."""
         if self._frozen:
             raise ValueError(_FROZEN_ERROR)
-        if isinstance(name_or_path, (int, slice)):
+        if isinstance(name_or_path, (integer_types, slice)):
             return list.__setitem__(self, name_or_path, value)
         elif isinstance(name_or_path, tuple):
             if len(name_or_path) == 0:
@@ -1055,24 +1060,24 @@ class FeatList(FeatStruct, list):
         if reentrances[id(self)]:
             assert id(self) not in reentrance_ids
             reentrance_ids[id(self)] = repr(len(reentrance_ids) + 1)
-            prefix = "(%s)" % reentrance_ids[id(self)]
+            prefix = '(%s)' % reentrance_ids[id(self)]
         else:
-            prefix = ""
+            prefix = ''
 
         segments = []
         for fval in self:
             if id(fval) in reentrance_ids:
-                segments.append("->(%s)" % reentrance_ids[id(fval)])
+                segments.append('->(%s)' % reentrance_ids[id(fval)])
             elif isinstance(fval, Variable):
                 segments.append(fval.name)
             elif isinstance(fval, Expression):
-                segments.append("%s" % fval)
+                segments.append('%s' % fval)
             elif isinstance(fval, FeatStruct):
                 segments.append(fval._repr(reentrances, reentrance_ids))
             else:
-                segments.append("%s" % repr(fval))
+                segments.append('%s' % unicode_repr(fval))
 
-        return "%s[%s]" % (prefix, ", ".join(segments))
+        return '%s[%s]' % (prefix, ', '.join(segments))
 
 
 ######################################################################
@@ -1080,7 +1085,7 @@ class FeatList(FeatStruct, list):
 ######################################################################
 
 
-def substitute_bindings(fstruct, bindings, fs_class="default"):
+def substitute_bindings(fstruct, bindings, fs_class='default'):
     """
     Return the feature structure that is obtained by replacing each
     variable bound by ``bindings`` with its binding.  If a variable is
@@ -1091,7 +1096,7 @@ def substitute_bindings(fstruct, bindings, fs_class="default"):
     :type bindings: dict(Variable -> any)
     :param bindings: A dictionary mapping from variables to values.
     """
-    if fs_class == "default":
+    if fs_class == 'default':
         fs_class = _default_fs_class(fstruct)
     fstruct = copy.deepcopy(fstruct)
     _substitute_bindings(fstruct, bindings, fs_class, set())
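
A quick sketch of the behaviour documented above, using a made-up feature structure: the bindings dict maps Variable objects to values, and the original structure is left untouched because of the deepcopy.

    >>> from nltk.featstruct import FeatStruct, substitute_bindings
    >>> from nltk.sem.logic import Variable
    >>> fs = FeatStruct("[agr=?x, subj=[agr=?x]]")
    >>> out = substitute_bindings(fs, {Variable('?x'): 'sg'})
    >>> out['agr'], out['subj', 'agr']          # both occurrences of ?x are replaced
    ('sg', 'sg')
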
@@ -1109,7 +1114,7 @@ def _substitute_bindings(fstruct, bindings, fs_class, visited):
     elif _is_sequence(fstruct):
         items = enumerate(fstruct)
     else:
-        raise ValueError("Expected mapping or sequence")
+        raise ValueError('Expected mapping or sequence')
     for (fname, fval) in items:
         while isinstance(fval, Variable) and fval in bindings:
             fval = fstruct[fname] = bindings[fval]
@@ -1119,7 +1124,7 @@ def _substitute_bindings(fstruct, bindings, fs_class, visited):
             fstruct[fname] = fval.substitute_bindings(bindings)
 
 
-def retract_bindings(fstruct, bindings, fs_class="default"):
+def retract_bindings(fstruct, bindings, fs_class='default'):
     """
     Return the feature structure that is obtained by replacing each
     feature structure value that is bound by ``bindings`` with the
@@ -1131,7 +1136,7 @@ def retract_bindings(fstruct, bindings, fs_class="default"):
     values in ``bindings`` may be modified if they are contained in
     ``fstruct``.
     """
-    if fs_class == "default":
+    if fs_class == 'default':
         fs_class = _default_fs_class(fstruct)
     (fstruct, new_bindings) = copy.deepcopy((fstruct, bindings))
     bindings.update(new_bindings)
@@ -1151,7 +1156,7 @@ def _retract_bindings(fstruct, inv_bindings, fs_class, visited):
     elif _is_sequence(fstruct):
         items = enumerate(fstruct)
     else:
-        raise ValueError("Expected mapping or sequence")
+        raise ValueError('Expected mapping or sequence')
     for (fname, fval) in items:
         if isinstance(fval, fs_class):
             if id(fval) in inv_bindings:
@@ -1159,12 +1164,12 @@ def _retract_bindings(fstruct, inv_bindings, fs_class, visited):
             _retract_bindings(fval, inv_bindings, fs_class, visited)
 
 
-def find_variables(fstruct, fs_class="default"):
+def find_variables(fstruct, fs_class='default'):
     """
     :return: The set of variables used by this feature structure.
     :rtype: set(Variable)
     """
-    if fs_class == "default":
+    if fs_class == 'default':
         fs_class = _default_fs_class(fstruct)
     return _variables(fstruct, set(), fs_class, set())
 
@@ -1179,7 +1184,7 @@ def _variables(fstruct, vars, fs_class, visited):
     elif _is_sequence(fstruct):
         items = enumerate(fstruct)
     else:
-        raise ValueError("Expected mapping or sequence")
+        raise ValueError('Expected mapping or sequence')
     for (fname, fval) in items:
         if isinstance(fval, Variable):
             vars.add(fval)
@@ -1191,7 +1196,7 @@ def _variables(fstruct, vars, fs_class, visited):
 
 
 def rename_variables(
-    fstruct, vars=None, used_vars=(), new_vars=None, fs_class="default"
+    fstruct, vars=None, used_vars=(), new_vars=None, fs_class='default'
 ):
     """
     Return the feature structure that is obtained by replacing
@@ -1234,7 +1239,7 @@ def rename_variables(
 
     If new_vars is not specified, then an empty dictionary is used.
     """
-    if fs_class == "default":
+    if fs_class == 'default':
         fs_class = _default_fs_class(fstruct)
 
     # Default values:
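
By contrast with substitute_bindings, rename_variables only freshens variable names; a minimal sketch (given the _rename_variable helper below, the fresh name for ?x works out to ?x2 when nothing else claims it):

    >>> from nltk.featstruct import FeatStruct, rename_variables, find_variables
    >>> fs = FeatStruct("[agr=?x, subj=[agr=?x]]")
    >>> renamed = rename_variables(fs)
    >>> sorted(v.name for v in find_variables(renamed))   # both ?x occurrences get the same fresh name
    ['?x2']
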
@@ -1263,7 +1268,7 @@ def _rename_variables(fstruct, vars, used_vars, new_vars, fs_class, visited):
     elif _is_sequence(fstruct):
         items = enumerate(fstruct)
     else:
-        raise ValueError("Expected mapping or sequence")
+        raise ValueError('Expected mapping or sequence')
     for (fname, fval) in items:
         if isinstance(fval, Variable):
             # If it's in new_vars, then rebind it.
@@ -1288,21 +1293,21 @@ def _rename_variables(fstruct, vars, used_vars, new_vars, fs_class, visited):
 
 
 def _rename_variable(var, used_vars):
-    name, n = re.sub("\d+$", "", var.name), 2
+    name, n = re.sub('\d+$', '', var.name), 2
     if not name:
-        name = "?"
-    while Variable("%s%s" % (name, n)) in used_vars:
+        name = '?'
+    while Variable('%s%s' % (name, n)) in used_vars:
         n += 1
-    return Variable("%s%s" % (name, n))
+    return Variable('%s%s' % (name, n))
 
 
-def remove_variables(fstruct, fs_class="default"):
+def remove_variables(fstruct, fs_class='default'):
     """
     :rtype: FeatStruct
     :return: The feature structure that is obtained by deleting
         all features whose values are ``Variables``.
     """
-    if fs_class == "default":
+    if fs_class == 'default':
         fs_class = _default_fs_class(fstruct)
     return _remove_variables(copy.deepcopy(fstruct), fs_class, set())
 
@@ -1317,7 +1322,7 @@ def _remove_variables(fstruct, fs_class, visited):
     elif _is_sequence(fstruct):
         items = list(enumerate(fstruct))
     else:
-        raise ValueError("Expected mapping or sequence")
+        raise ValueError('Expected mapping or sequence')
 
     for (fname, fval) in items:
         if isinstance(fval, Variable):
@@ -1332,10 +1337,10 @@ def _remove_variables(fstruct, fs_class, visited):
 ######################################################################
 
 
-
+@python_2_unicode_compatible
 class _UnificationFailure(object):
     def __repr__(self):
-        return "nltk.featstruct.UnificationFailure"
+        return 'nltk.featstruct.UnificationFailure'
 
 
 UnificationFailure = _UnificationFailure()
@@ -1356,7 +1361,7 @@ def unify(
     trace=False,
     fail=None,
     rename_vars=True,
-    fs_class="default",
+    fs_class='default',
 ):
     """
     Unify ``fstruct1`` with ``fstruct2``, and return the resulting feature
@@ -1402,7 +1407,7 @@ def unify(
     """
     # Decide which class(es) will be treated as feature structures,
     # for the purposes of unification.
-    if fs_class == "default":
+    if fs_class == 'default':
         fs_class = _default_fs_class(fstruct1)
         if _default_fs_class(fstruct2) != fs_class:
             raise ValueError(
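
The surrounding hunks mainly adjust quoting and Python 2 compatibility shims, so as a reminder of what unify itself does, here is a minimal sketch of a successful merge and of a failure (feature names and values are arbitrary):

    >>> from nltk.featstruct import FeatStruct, unify
    >>> merged = unify(FeatStruct("[num='sg', gnd=?g]"), FeatStruct("[gnd='fem', per=3]"))
    >>> merged['num'], merged['gnd'], merged['per']   # ?g is bound to 'fem' and substituted into the result
    ('sg', 'fem', 3)
    >>> print(unify(FeatStruct(num='sg'), FeatStruct(num='pl')))   # incompatible values -> None
    None
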
@@ -1517,10 +1522,10 @@ def _destructively_unify(
     # Unifying two mappings:
     if _is_mapping(fstruct1) and _is_mapping(fstruct2):
         for fname in fstruct1:
-            if getattr(fname, "default", None) is not None:
+            if getattr(fname, 'default', None) is not None:
                 fstruct2.setdefault(fname, fname.default)
         for fname in fstruct2:
-            if getattr(fname, "default", None) is not None:
+            if getattr(fname, 'default', None) is not None:
                 fstruct1.setdefault(fname, fname.default)
 
         # Unify any values that are defined in both fstruct1 and
@@ -1576,7 +1581,7 @@ def _destructively_unify(
         return UnificationFailure
 
     # Unifying anything else: not allowed!
-    raise TypeError("Expected mappings or sequences")
+    raise TypeError('Expected mappings or sequences')
 
 
 def _unify_feature_values(
@@ -1654,8 +1659,8 @@ def _unify_feature_values(
             # Sanity check: unify value should be symmetric
             if isinstance(fval2, CustomFeatureValue) and result != fval2.unify(fval1):
                 raise AssertionError(
-                    "CustomFeatureValue objects %r and %r disagree "
-                    "about unification value: %r vs. %r"
+                    'CustomFeatureValue objects %r and %r disagree '
+                    'about unification value: %r vs. %r'
                     % (fval1, fval2, result, fval2.unify(fval1))
                 )
         elif isinstance(fval2, CustomFeatureValue):
@@ -1730,7 +1735,7 @@ def _apply_forwards(fstruct, forward, fs_class, visited):
     elif _is_sequence(fstruct):
         items = enumerate(fstruct)
     else:
-        raise ValueError("Expected mapping or sequence")
+        raise ValueError('Expected mapping or sequence')
     for fname, fval in items:
         if isinstance(fval, fs_class):
             # Replace w/ forwarded value.
@@ -1755,52 +1760,52 @@ def _resolve_aliases(bindings):
 
 def _trace_unify_start(path, fval1, fval2):
     if path == ():
-        print("\nUnification trace:")
+        print('\nUnification trace:')
     else:
-        fullname = ".".join("%s" % n for n in path)
-        print("  " + "|   " * (len(path) - 1) + "|")
-        print("  " + "|   " * (len(path) - 1) + "| Unify feature: %s" % fullname)
-    print("  " + "|   " * len(path) + " / " + _trace_valrepr(fval1))
-    print("  " + "|   " * len(path) + "|\\ " + _trace_valrepr(fval2))
+        fullname = '.'.join("%s" % n for n in path)
+        print('  ' + '|   ' * (len(path) - 1) + '|')
+        print('  ' + '|   ' * (len(path) - 1) + '| Unify feature: %s' % fullname)
+    print('  ' + '|   ' * len(path) + ' / ' + _trace_valrepr(fval1))
+    print('  ' + '|   ' * len(path) + '|\\ ' + _trace_valrepr(fval2))
 
 
 def _trace_unify_identity(path, fval1):
-    print("  " + "|   " * len(path) + "|")
-    print("  " + "|   " * len(path) + "| (identical objects)")
-    print("  " + "|   " * len(path) + "|")
-    print("  " + "|   " * len(path) + "+-->" + repr(fval1))
+    print('  ' + '|   ' * len(path) + '|')
+    print('  ' + '|   ' * len(path) + '| (identical objects)')
+    print('  ' + '|   ' * len(path) + '|')
+    print('  ' + '|   ' * len(path) + '+-->' + unicode_repr(fval1))
 
 
 def _trace_unify_fail(path, result):
     if result is UnificationFailure:
-        resume = ""
+        resume = ''
     else:
-        resume = " (nonfatal)"
-    print("  " + "|   " * len(path) + "|   |")
-    print("  " + "X   " * len(path) + "X   X <-- FAIL" + resume)
+        resume = ' (nonfatal)'
+    print('  ' + '|   ' * len(path) + '|   |')
+    print('  ' + 'X   ' * len(path) + 'X   X <-- FAIL' + resume)
 
 
 def _trace_unify_succeed(path, fval1):
     # Print the result.
-    print("  " + "|   " * len(path) + "|")
-    print("  " + "|   " * len(path) + "+-->" + repr(fval1))
+    print('  ' + '|   ' * len(path) + '|')
+    print('  ' + '|   ' * len(path) + '+-->' + unicode_repr(fval1))
 
 
 def _trace_bindings(path, bindings):
     # Print the bindings (if any).
     if len(bindings) > 0:
         binditems = sorted(bindings.items(), key=lambda v: v[0].name)
-        bindstr = "{%s}" % ", ".join(
-            "%s: %s" % (var, _trace_valrepr(val)) for (var, val) in binditems
+        bindstr = '{%s}' % ', '.join(
+            '%s: %s' % (var, _trace_valrepr(val)) for (var, val) in binditems
         )
-        print("  " + "|   " * len(path) + "    Bindings: " + bindstr)
+        print('  ' + '|   ' * len(path) + '    Bindings: ' + bindstr)
 
 
 def _trace_valrepr(val):
     if isinstance(val, Variable):
-        return "%s" % val
+        return '%s' % val
     else:
-        return "%s" % repr(val)
+        return '%s' % unicode_repr(val)
 
 
 def subsumes(fstruct1, fstruct2):
@@ -1837,14 +1842,14 @@ def conflicts(fstruct1, fstruct2, trace=0):
 
 
 def _is_mapping(v):
-    return hasattr(v, "__contains__") and hasattr(v, "keys")
+    return hasattr(v, '__contains__') and hasattr(v, 'keys')
 
 
 def _is_sequence(v):
     return (
-        hasattr(v, "__iter__")
-        and hasattr(v, "__len__")
-        and not isinstance(v, str)
+        hasattr(v, '__iter__')
+        and hasattr(v, '__len__')
+        and not isinstance(v, string_types)
     )
 
 
@@ -1855,8 +1860,8 @@ def _default_fs_class(obj):
         return (dict, list)
     else:
         raise ValueError(
-            "To unify objects of type %s, you must specify "
-            "fs_class explicitly." % obj.__class__.__name__
+            'To unify objects of type %s, you must specify '
+            'fs_class explicitly.' % obj.__class__.__name__
         )
 
 
@@ -1891,7 +1896,7 @@ class SubstituteBindingsSequence(SubstituteBindingsI):
             return bindings.get(v, v)
 
 
-
+@python_2_unicode_compatible
 class FeatureValueTuple(SubstituteBindingsSequence, tuple):
     """
     A base feature value that is a tuple of other base feature values.
@@ -1902,11 +1907,11 @@ class FeatureValueTuple(SubstituteBindingsSequence, tuple):
 
     def __repr__(self):  # [xx] really use %s here?
         if len(self) == 0:
-            return "()"
-        return "(%s)" % ", ".join("%s" % (b,) for b in self)
-
+            return '()'
+        return '(%s)' % ', '.join('%s' % (b,) for b in self)
 
 
+@python_2_unicode_compatible
 class FeatureValueSet(SubstituteBindingsSequence, frozenset):
     """
     A base feature value that is a set of other base feature values.
@@ -1917,15 +1922,15 @@ class FeatureValueSet(SubstituteBindingsSequence, frozenset):
 
     def __repr__(self):  # [xx] really use %s here?
         if len(self) == 0:
-            return "{/}"  # distinguish from dict.
+            return '{/}'  # distinguish from dict.
         # n.b., we sort the string reprs of our elements, to ensure
         # that our own repr is deterministic.
-        return "{%s}" % ", ".join(sorted("%s" % (b,) for b in self))
+        return '{%s}' % ', '.join(sorted('%s' % (b,) for b in self))
 
     __str__ = __repr__
 
 
-
+@python_2_unicode_compatible
 class FeatureValueUnion(SubstituteBindingsSequence, frozenset):
     """
     A base feature value that represents the union of two or more
@@ -1953,10 +1958,10 @@ class FeatureValueUnion(SubstituteBindingsSequence, frozenset):
         # n.b., we sort the string reprs of our elements, to ensure
         # that our own repr is deterministic.  also, note that len(self)
         # is guaranteed to be 2 or more.
-        return "{%s}" % "+".join(sorted("%s" % (b,) for b in self))
-
+        return '{%s}' % '+'.join(sorted('%s' % (b,) for b in self))
 
 
+@python_2_unicode_compatible
 class FeatureValueConcat(SubstituteBindingsSequence, tuple):
     """
     A base feature value that represents the concatenation of two or
@@ -1982,7 +1987,7 @@ class FeatureValueConcat(SubstituteBindingsSequence, tuple):
 
     def __repr__(self):
         # n.b.: len(self) is guaranteed to be 2 or more.
-        return "(%s)" % "+".join("%s" % (b,) for b in self)
+        return '(%s)' % '+'.join('%s' % (b,) for b in self)
 
 
 def _flatten(lst, cls):
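Editor's note on the @python_2_unicode_compatible decorators re-added in these hunks: the helper comes from nltk.compat in NLTK 3.4 (the grammar.py hunk further down imports it explicitly). A minimal sketch of the usual six-style semantics, not the exact NLTK implementation: on Python 2 it turns a text-returning __str__ into __unicode__ plus a UTF-8-encoding __str__, and on Python 3 it is a no-op.

    import sys

    def python_2_unicode_compatible(cls):
        # Sketch only -- the real helper lives in nltk.compat.
        if sys.version_info[0] == 2:
            cls.__unicode__ = cls.__str__                      # keep the text-returning version
            cls.__str__ = lambda self: self.__unicode__().encode('utf-8')
        return cls                                             # no-op on Python 3

This is also why ProbabilisticProduction.__str__ in the grammar.py hunk below calls Production.__unicode__(self) rather than super().__str__(): under the decorator, the Python 2 class exposes its text form as __unicode__.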
@@ -2005,7 +2010,7 @@ def _flatten(lst, cls):
 
 
 @total_ordering
-
+@python_2_unicode_compatible
 class Feature(object):
     """
     A feature identifier that's specialized to put additional
@@ -2013,15 +2018,15 @@ class Feature(object):
     """
 
     def __init__(self, name, default=None, display=None):
-        assert display in (None, "prefix", "slash")
+        assert display in (None, 'prefix', 'slash')
 
         self._name = name  # [xx] rename to .identifier?
         self._default = default  # [xx] not implemented yet.
         self._display = display
 
-        if self._display == "prefix":
+        if self._display == 'prefix':
             self._sortkey = (-1, self._name)
-        elif self._display == "slash":
+        elif self._display == 'slash':
             self._sortkey = (1, self._name)
         else:
             self._sortkey = (0, self._name)
@@ -2042,10 +2047,10 @@ class Feature(object):
         return self._display
 
     def __repr__(self):
-        return "*%s*" % self.name
+        return '*%s*' % self.name
 
     def __lt__(self, other):
-        if isinstance(other, str):
+        if isinstance(other, string_types):
             return True
         if not isinstance(other, Feature):
             raise_unorderable_types("<", self, other)
@@ -2084,12 +2089,12 @@ class SlashFeature(Feature):
 
 
 class RangeFeature(Feature):
-    RANGE_RE = re.compile("(-?\d+):(-?\d+)")
+    RANGE_RE = re.compile('(-?\d+):(-?\d+)')
 
     def read_value(self, s, position, reentrances, parser):
         m = self.RANGE_RE.match(s, position)
         if not m:
-            raise ValueError("range", position)
+            raise ValueError('range', position)
         return (int(m.group(1)), int(m.group(2))), m.end()
 
     def unify_base_values(self, fval1, fval2, bindings):
@@ -2103,8 +2108,8 @@ class RangeFeature(Feature):
         return rng
 
 
-SLASH = SlashFeature("slash", default=False, display="slash")
-TYPE = Feature("type", display="prefix")
+SLASH = SlashFeature('slash', default=False, display='slash')
+TYPE = Feature('type', display='prefix')
 
 
 ######################################################################
@@ -2137,19 +2142,19 @@ class CustomFeatureValue(object):
         If this base value unifies with ``other``, then return the
         unified value.  Otherwise, return ``UnificationFailure``.
         """
-        raise NotImplementedError("abstract base class")
+        raise NotImplementedError('abstract base class')
 
     def __eq__(self, other):
-        raise NotImplementedError("abstract base class")
+        raise NotImplementedError('abstract base class')
 
     def __ne__(self, other):
         return not self == other
 
     def __lt__(self, other):
-        raise NotImplementedError("abstract base class")
+        raise NotImplementedError('abstract base class')
 
     def __hash__(self):
-        raise TypeError("%s objects or unhashable" % self.__class__.__name__)
+        raise TypeError('%s objects or unhashable' % self.__class__.__name__)
 
 
 ######################################################################
@@ -2171,13 +2176,13 @@ class FeatStructReader(object):
         self._prefix_feature = None
         self._slash_feature = None
         for feature in features:
-            if feature.display == "slash":
+            if feature.display == 'slash':
                 if self._slash_feature:
-                    raise ValueError("Multiple features w/ display=slash")
+                    raise ValueError('Multiple features w/ display=slash')
                 self._slash_feature = feature
-            if feature.display == "prefix":
+            if feature.display == 'prefix':
                 if self._prefix_feature:
-                    raise ValueError("Multiple features w/ display=prefix")
+                    raise ValueError('Multiple features w/ display=prefix')
                 self._prefix_feature = feature
         self._features_with_defaults = [
             feature for feature in features if feature.default is not None
@@ -2208,21 +2213,21 @@ class FeatStructReader(object):
         s = s.strip()
         value, position = self.read_partial(s, 0, {}, fstruct)
         if position != len(s):
-            self._error(s, "end of string", position)
+            self._error(s, 'end of string', position)
         return value
 
-    _START_FSTRUCT_RE = re.compile(r"\s*(?:\((\d+)\)\s*)?(\??[\w-]+)?(\[)")
-    _END_FSTRUCT_RE = re.compile(r"\s*]\s*")
-    _SLASH_RE = re.compile(r"/")
+    _START_FSTRUCT_RE = re.compile(r'\s*(?:\((\d+)\)\s*)?(\??[\w-]+)?(\[)')
+    _END_FSTRUCT_RE = re.compile(r'\s*]\s*')
+    _SLASH_RE = re.compile(r'/')
     _FEATURE_NAME_RE = re.compile(r'\s*([+-]?)([^\s\(\)<>"\'\-=\[\],]+)\s*')
-    _REENTRANCE_RE = re.compile(r"\s*->\s*")
-    _TARGET_RE = re.compile(r"\s*\((\d+)\)\s*")
-    _ASSIGN_RE = re.compile(r"\s*=\s*")
-    _COMMA_RE = re.compile(r"\s*,\s*")
-    _BARE_PREFIX_RE = re.compile(r"\s*(?:\((\d+)\)\s*)?(\??[\w-]+\s*)()")
+    _REENTRANCE_RE = re.compile(r'\s*->\s*')
+    _TARGET_RE = re.compile(r'\s*\((\d+)\)\s*')
+    _ASSIGN_RE = re.compile(r'\s*=\s*')
+    _COMMA_RE = re.compile(r'\s*,\s*')
+    _BARE_PREFIX_RE = re.compile(r'\s*(?:\((\d+)\)\s*)?(\??[\w-]+\s*)()')
     # This one is used to distinguish fdicts from flists:
     _START_FDICT_RE = re.compile(
-        r"(%s)|(%s\s*(%s\s*(=|->)|[+-]%s|\]))"
+        r'(%s)|(%s\s*(%s\s*(=|->)|[+-]%s|\]))'
         % (
             _BARE_PREFIX_RE.pattern,
             _START_FSTRUCT_RE.pattern,
@@ -2265,14 +2270,14 @@ class FeatStructReader(object):
         if not match:
             match = self._BARE_PREFIX_RE.match(s, position)
             if not match:
-                raise ValueError("open bracket or identifier", position)
+                raise ValueError('open bracket or identifier', position)
         position = match.end()
 
         # If there was an identifier, record it.

         if match.group(1):
             identifier = match.group(1)
             if identifier in reentrances:
-                raise ValueError("new identifier", match.start(1))
+                raise ValueError('new identifier', match.start(1))
             reentrances[identifier] = fstruct
 
         if isinstance(fstruct, FeatDict):
@@ -2285,10 +2290,10 @@ class FeatStructReader(object):
     def _read_partial_featlist(self, s, position, match, reentrances, fstruct):
         # Prefix features are not allowed:
         if match.group(2):
-            raise ValueError("open bracket")
+            raise ValueError('open bracket')
         # Bare prefixes are not allowed:
         if not match.group(3):
-            raise ValueError("open bracket")
+            raise ValueError('open bracket')
 
         # Build a list of the features defined by the structure.
         while position < len(s):
@@ -2303,10 +2308,10 @@ class FeatStructReader(object):
                 position = match.end()
                 match = self._TARGET_RE.match(s, position)
                 if not match:
-                    raise ValueError("identifier", position)
+                    raise ValueError('identifier', position)
                 target = match.group(1)
                 if target not in reentrances:
-                    raise ValueError("bound identifier", position)
+                    raise ValueError('bound identifier', position)
                 position = match.end()
                 fstruct.append(reentrances[target])
 
@@ -2322,19 +2327,19 @@ class FeatStructReader(object):
             # Otherwise, there should be a comma
             match = self._COMMA_RE.match(s, position)
             if match is None:
-                raise ValueError("comma", position)
+                raise ValueError('comma', position)
             position = match.end()
 
         # We never saw a close bracket.
-        raise ValueError("close bracket", position)
+        raise ValueError('close bracket', position)
 
     def _read_partial_featdict(self, s, position, match, reentrances, fstruct):
         # If there was a prefix feature, record it.
         if match.group(2):
             if self._prefix_feature is None:
-                raise ValueError("open bracket or identifier", match.start(2))
+                raise ValueError('open bracket or identifier', match.start(2))
             prefixval = match.group(2).strip()
-            if prefixval.startswith("?"):
+            if prefixval.startswith('?'):
                 prefixval = Variable(prefixval)
             fstruct[self._prefix_feature] = prefixval
 
@@ -2361,24 +2366,24 @@ class FeatStructReader(object):
             # Get the feature name's name
             match = self._FEATURE_NAME_RE.match(s, position)
             if match is None:
-                raise ValueError("feature name", position)
+                raise ValueError('feature name', position)
             name = match.group(2)
             position = match.end()
 
             # Check if it's a special feature.
-            if name[0] == "*" and name[-1] == "*":
+            if name[0] == '*' and name[-1] == '*':
                 name = self._features.get(name[1:-1])
                 if name is None:
-                    raise ValueError("known special feature", match.start(2))
+                    raise ValueError('known special feature', match.start(2))
 
             # Check if this feature has a value already.
             if name in fstruct:
-                raise ValueError("new name", match.start(2))
+                raise ValueError('new name', match.start(2))
 
             # Boolean value ("+name" or "-name")
-            if match.group(1) == "+":
+            if match.group(1) == '+':
                 value = True
-            if match.group(1) == "-":
+            if match.group(1) == '-':
                 value = False
 
             # Reentrance link ("-> (target)")
@@ -2388,10 +2393,10 @@ class FeatStructReader(object):
                     position = match.end()
                     match = self._TARGET_RE.match(s, position)
                     if not match:
-                        raise ValueError("identifier", position)
+                        raise ValueError('identifier', position)
                     target = match.group(1)
                     if target not in reentrances:
-                        raise ValueError("bound identifier", position)
+                        raise ValueError('bound identifier', position)
                     position = match.end()
                     value = reentrances[target]
 
@@ -2403,7 +2408,7 @@ class FeatStructReader(object):
                     value, position = self._read_value(name, s, position, reentrances)
                 # None of the above: error.
                 else:
-                    raise ValueError("equals sign", position)
+                    raise ValueError('equals sign', position)
 
             # Store the value.
             fstruct[name] = value
@@ -2415,11 +2420,11 @@ class FeatStructReader(object):
             # Otherwise, there should be a comma
             match = self._COMMA_RE.match(s, position)
             if match is None:
-                raise ValueError("comma", position)
+                raise ValueError('comma', position)
             position = match.end()
 
         # We never saw a close bracket.
-        raise ValueError("close bracket", position)
+        raise ValueError('close bracket', position)
 
     def _finalize(self, s, pos, reentrances, fstruct):
         """
@@ -2450,19 +2455,19 @@ class FeatStructReader(object):
             if match:
                 handler_func = getattr(self, handler)
                 return handler_func(s, position, reentrances, match)
-        raise ValueError("value", position)
+        raise ValueError('value', position)
 
     def _error(self, s, expected, position):
-        lines = s.split("\n")
+        lines = s.split('\n')
         while position > len(lines[0]):
             position -= len(lines.pop(0)) + 1  # +1 for the newline.
         estr = (
-            "Error parsing feature structure\n    "
+            'Error parsing feature structure\n    '
             + lines[0]
-            + "\n    "
-            + " " * position
-            + "^ "
-            + "Expected %s" % expected
+            + '\n    '
+            + ' ' * position
+            + '^ '
+            + 'Expected %s' % expected
         )
         raise ValueError(estr)
 
@@ -2481,20 +2486,20 @@ class FeatStructReader(object):
     #: the string position where the value ended.  (n.b.: order is
     #: important here!)
     VALUE_HANDLERS = [
-        ("read_fstruct_value", _START_FSTRUCT_RE),
-        ("read_var_value", re.compile(r"\?[a-zA-Z_][a-zA-Z0-9_]*")),
-        ("read_str_value", re.compile("[uU]?[rR]?(['\"])")),
-        ("read_int_value", re.compile(r"-?\d+")),
-        ("read_sym_value", re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")),
+        ('read_fstruct_value', _START_FSTRUCT_RE),
+        ('read_var_value', re.compile(r'\?[a-zA-Z_][a-zA-Z0-9_]*')),
+        ('read_str_value', re.compile("[uU]?[rR]?(['\"])")),
+        ('read_int_value', re.compile(r'-?\d+')),
+        ('read_sym_value', re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')),
         (
-            "read_app_value",
-            re.compile(r"<(app)\((\?[a-z][a-z]*)\s*," r"\s*(\?[a-z][a-z]*)\)>"),
+            'read_app_value',
+            re.compile(r'<(app)\((\?[a-z][a-z]*)\s*,' r'\s*(\?[a-z][a-z]*)\)>'),
         ),
         #       ('read_logic_value', re.compile(r'<([^>]*)>')),
         # lazily match any character after '<' until we hit a '>' not preceded by '-'
-        ("read_logic_value", re.compile(r"<(.*?)(?<!-)>")),
-        ("read_set_value", re.compile(r"{")),
-        ("read_tuple_value", re.compile(r"\(")),
+        ('read_logic_value', re.compile(r'<(.*?)(?<!-)>')),
+        ('read_set_value', re.compile(r'{')),
+        ('read_tuple_value', re.compile(r'\(')),
     ]
 
     def read_fstruct_value(self, s, position, reentrances, match):
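For orientation on the handler table just above: each (method, regex) pair claims the next value in a feature-structure string, tried in order. A hedged usage sketch with illustrative feature names (not taken from the diff):

    from nltk.featstruct import FeatStruct

    # ?x -> read_var_value, 3 -> read_int_value, 'walk' -> read_str_value,
    # the symbols inside (...) -> read_tuple_value plus read_sym_value.
    fs = FeatStruct("[agr=?x, num=3, lex='walk', subcat=(NP, PP)]")
    print(fs)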
@@ -2510,7 +2515,7 @@ class FeatStructReader(object):
     def read_var_value(self, s, position, reentrances, match):
         return Variable(match.group()), match.end()
 
-    _SYM_CONSTS = {"None": None, "True": True, "False": False}
+    _SYM_CONSTS = {'None': None, 'True': True, 'False': False}
 
     def read_sym_value(self, s, position, reentrances, match):
         val, end = match.group(), match.end()
@@ -2518,7 +2523,7 @@ class FeatStructReader(object):
 
     def read_app_value(self, s, position, reentrances, match):
         """Mainly included for backwards compat."""
-        return self._logic_parser.parse("%s(%s)" % match.group(2, 3)), match.end()
+        return self._logic_parser.parse('%s(%s)' % match.group(2, 3)), match.end()
 
     def read_logic_value(self, s, position, reentrances, match):
         try:
@@ -2528,16 +2533,16 @@ class FeatStructReader(object):
                 raise ValueError()
             return expr, match.end()
         except ValueError:
-            raise ValueError("logic expression", match.start(1))
+            raise ValueError('logic expression', match.start(1))
 
     def read_tuple_value(self, s, position, reentrances, match):
         return self._read_seq_value(
-            s, position, reentrances, match, ")", FeatureValueTuple, FeatureValueConcat
+            s, position, reentrances, match, ')', FeatureValueTuple, FeatureValueConcat
         )
 
     def read_set_value(self, s, position, reentrances, match):
         return self._read_seq_value(
-            s, position, reentrances, match, "}", FeatureValueSet, FeatureValueUnion
+            s, position, reentrances, match, '}', FeatureValueSet, FeatureValueUnion
         )
 
     def _read_seq_value(
@@ -2549,7 +2554,7 @@ class FeatStructReader(object):
         cp = re.escape(close_paren)
         position = match.end()
         # Special syntax for empty tuples:
-        m = re.compile(r"\s*/?\s*%s" % cp).match(s, position)
+        m = re.compile(r'\s*/?\s*%s' % cp).match(s, position)
         if m:
             return seq_class(), m.end()
         # Read values:
@@ -2557,7 +2562,7 @@ class FeatStructReader(object):
         seen_plus = False
         while True:
             # Close paren: return value.
-            m = re.compile(r"\s*%s" % cp).match(s, position)
+            m = re.compile(r'\s*%s' % cp).match(s, position)
             if m:
                 if seen_plus:
                     return plus_class(values), m.end()
@@ -2569,10 +2574,10 @@ class FeatStructReader(object):
             values.append(val)
 
             # Comma or looking at close paren
-            m = re.compile(r"\s*(,|\+|(?=%s))\s*" % cp).match(s, position)
+            m = re.compile(r'\s*(,|\+|(?=%s))\s*' % cp).match(s, position)
             if not m:
                 raise ValueError("',' or '+' or '%s'" % cp, position)
-            if m.group(1) == "+":
+            if m.group(1) == '+':
                 seen_plus = True
             position = m.end()
 
@@ -2582,34 +2587,34 @@ class FeatStructReader(object):
 ######################################################################
 
 
-def display_unification(fs1, fs2, indent="  "):
+def display_unification(fs1, fs2, indent='  '):
     # Print the two input feature structures, side by side.
-    fs1_lines = ("%s" % fs1).split("\n")
-    fs2_lines = ("%s" % fs2).split("\n")
+    fs1_lines = ("%s" % fs1).split('\n')
+    fs2_lines = ("%s" % fs2).split('\n')
     if len(fs1_lines) > len(fs2_lines):
-        blankline = "[" + " " * (len(fs2_lines[0]) - 2) + "]"
+        blankline = '[' + ' ' * (len(fs2_lines[0]) - 2) + ']'
         fs2_lines += [blankline] * len(fs1_lines)
     else:
-        blankline = "[" + " " * (len(fs1_lines[0]) - 2) + "]"
+        blankline = '[' + ' ' * (len(fs1_lines[0]) - 2) + ']'
         fs1_lines += [blankline] * len(fs2_lines)
     for (fs1_line, fs2_line) in zip(fs1_lines, fs2_lines):
-        print(indent + fs1_line + "   " + fs2_line)
-    print(indent + "-" * len(fs1_lines[0]) + "   " + "-" * len(fs2_lines[0]))
+        print(indent + fs1_line + '   ' + fs2_line)
+    print(indent + '-' * len(fs1_lines[0]) + '   ' + '-' * len(fs2_lines[0]))
 
     linelen = len(fs1_lines[0]) * 2 + 3
-    print(indent + "|               |".center(linelen))
-    print(indent + "+-----UNIFY-----+".center(linelen))
-    print(indent + "|".center(linelen))
-    print(indent + "V".center(linelen))
+    print(indent + '|               |'.center(linelen))
+    print(indent + '+-----UNIFY-----+'.center(linelen))
+    print(indent + '|'.center(linelen))
+    print(indent + 'V'.center(linelen))
 
     bindings = {}
 
     result = fs1.unify(fs2, bindings)
     if result is None:
-        print(indent + "(FAILED)".center(linelen))
+        print(indent + '(FAILED)'.center(linelen))
     else:
         print(
-            "\n".join(indent + l.center(linelen) for l in ("%s" % result).split("\n"))
+            '\n'.join(indent + l.center(linelen) for l in ("%s" % result).split('\n'))
         )
         if bindings and len(bindings.bound_variables()) > 0:
             print(repr(bindings).center(linelen))
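display_unification above is only a pretty-printer around FeatStruct.unify; a minimal sketch of the underlying call (the feature values are illustrative):

    from nltk.featstruct import FeatStruct

    fs1 = FeatStruct('[agr=[number=sing]]')
    fs2 = FeatStruct('[agr=[gender=fem]]')
    print(fs1.unify(fs2))   # merged structure, or None when unification fails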
@@ -2619,16 +2624,16 @@ def display_unification(fs1, fs2, indent="  "):
 def interactive_demo(trace=False):
     import random, sys
 
-    HELP = """
+    HELP = '''
     1-%d: Select the corresponding feature structure
     q: Quit
     t: Turn tracing on or off
     l: List all feature structures
     ?: Help
-    """
+    '''
 
     print(
-        """
+        '''
     This demo will repeatedly present you with a list of feature
     structures, and ask you to choose two for unification.  Whenever a
     new feature structure is generated, it is added to the list of
@@ -2637,26 +2642,26 @@ def interactive_demo(trace=False):
     random subset for you to choose between at a given time.  If you
     want to see the complete lists, type "l".  For a list of valid
     commands, type "?".
-    """
+    '''
     )
     print('Press "Enter" to continue...')
     sys.stdin.readline()
 
     fstruct_strings = [
-        "[agr=[number=sing, gender=masc]]",
-        "[agr=[gender=masc, person=3]]",
-        "[agr=[gender=fem, person=3]]",
-        "[subj=[agr=(1)[]], agr->(1)]",
-        "[obj=?x]",
-        "[subj=?x]",
-        "[/=None]",
-        "[/=NP]",
-        "[cat=NP]",
-        "[cat=VP]",
-        "[cat=PP]",
-        "[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]",
-        "[gender=masc, agr=?C]",
-        "[gender=?S, agr=[gender=?S,person=3]]",
+        '[agr=[number=sing, gender=masc]]',
+        '[agr=[gender=masc, person=3]]',
+        '[agr=[gender=fem, person=3]]',
+        '[subj=[agr=(1)[]], agr->(1)]',
+        '[obj=?x]',
+        '[subj=?x]',
+        '[/=None]',
+        '[/=NP]',
+        '[cat=NP]',
+        '[cat=VP]',
+        '[cat=PP]',
+        '[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]',
+        '[gender=masc, agr=?C]',
+        '[gender=?S, agr=[gender=?S,person=3]]',
     ]
 
     all_fstructs = [
@@ -2666,10 +2671,10 @@ def interactive_demo(trace=False):
     def list_fstructs(fstructs):
         for i, fstruct in fstructs:
             print()
-            lines = ("%s" % fstruct).split("\n")
-            print("%3d: %s" % (i + 1, lines[0]))
+            lines = ("%s" % fstruct).split('\n')
+            print('%3d: %s' % (i + 1, lines[0]))
             for line in lines[1:]:
-                print("     " + line)
+                print('     ' + line)
         print()
 
     while True:
@@ -2680,40 +2685,40 @@ def interactive_demo(trace=False):
         else:
             fstructs = all_fstructs
 
-        print("_" * 75)
+        print('_' * 75)
 
-        print("Choose two feature structures to unify:")
+        print('Choose two feature structures to unify:')
         list_fstructs(fstructs)
 
         selected = [None, None]
-        for (nth, i) in (("First", 0), ("Second", 1)):
+        for (nth, i) in (('First', 0), ('Second', 1)):
             while selected[i] is None:
                 print(
                     (
-                        "%s feature structure (1-%d,q,t,l,?): "
+                        '%s feature structure (1-%d,q,t,l,?): '
                         % (nth, len(all_fstructs))
                     ),
-                    end=" ",
+                    end=' ',
                 )
                 try:
                     input = sys.stdin.readline().strip()
-                    if input in ("q", "Q", "x", "X"):
+                    if input in ('q', 'Q', 'x', 'X'):
                         return
-                    if input in ("t", "T"):
+                    if input in ('t', 'T'):
                         trace = not trace
-                        print("   Trace = %s" % trace)
+                        print('   Trace = %s' % trace)
                         continue
-                    if input in ("h", "H", "?"):
+                    if input in ('h', 'H', '?'):
                         print(HELP % len(fstructs))
                         continue
-                    if input in ("l", "L"):
+                    if input in ('l', 'L'):
                         list_fstructs(all_fstructs)
                         continue
                     num = int(input) - 1
                     selected[i] = all_fstructs[num][1]
                     print()
                 except:
-                    print("Bad sentence number")
+                    print('Bad sentence number')
                     continue
 
         if trace:
@@ -2729,7 +2734,7 @@ def interactive_demo(trace=False):
 
         print('\nType "Enter" to continue unifying; or "q" to quit.')
         input = sys.stdin.readline().strip()
-        if input in ("q", "Q", "x", "X"):
+        if input in ('q', 'Q', 'x', 'X'):
             return
 
 
@@ -2741,20 +2746,20 @@ def demo(trace=False):
 
     # processor breaks with values like '3rd'
     fstruct_strings = [
-        "[agr=[number=sing, gender=masc]]",
-        "[agr=[gender=masc, person=3]]",
-        "[agr=[gender=fem, person=3]]",
-        "[subj=[agr=(1)[]], agr->(1)]",
-        "[obj=?x]",
-        "[subj=?x]",
-        "[/=None]",
-        "[/=NP]",
-        "[cat=NP]",
-        "[cat=VP]",
-        "[cat=PP]",
-        "[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]",
-        "[gender=masc, agr=?C]",
-        "[gender=?S, agr=[gender=?S,person=3]]",
+        '[agr=[number=sing, gender=masc]]',
+        '[agr=[gender=masc, person=3]]',
+        '[agr=[gender=fem, person=3]]',
+        '[subj=[agr=(1)[]], agr->(1)]',
+        '[obj=?x]',
+        '[subj=?x]',
+        '[/=None]',
+        '[/=NP]',
+        '[cat=NP]',
+        '[cat=VP]',
+        '[cat=PP]',
+        '[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]',
+        '[gender=masc, agr=?C]',
+        '[gender=?S, agr=[gender=?S,person=3]]',
     ]
     all_fstructs = [FeatStruct(fss) for fss in fstruct_strings]
     # MAX_CHOICES = 5
@@ -2772,20 +2777,20 @@ def demo(trace=False):
             )
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
 
 __all__ = [
-    "FeatStruct",
-    "FeatDict",
-    "FeatList",
-    "unify",
-    "subsumes",
-    "conflicts",
-    "Feature",
-    "SlashFeature",
-    "RangeFeature",
-    "SLASH",
-    "TYPE",
-    "FeatStructReader",
+    'FeatStruct',
+    'FeatDict',
+    'FeatList',
+    'unify',
+    'subsumes',
+    'conflicts',
+    'Feature',
+    'SlashFeature',
+    'RangeFeature',
+    'SLASH',
+    'TYPE',
+    'FeatStructReader',
 ]
index c6c7a69..5ada3cf 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Context Free Grammars
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 #         Jason Narad <jason.narad@gmail.com>
@@ -68,10 +68,15 @@ The operation of replacing the left hand side (*lhs*) of a production
 with the right hand side (*rhs*) in a tree (*tree*) is known as
 "expanding" *lhs* to *rhs* in *tree*.
 """
+from __future__ import print_function, unicode_literals, division
+
 import re
 from functools import total_ordering
 
+from six import string_types
+
 from nltk.util import transitive_closure, invert_graph
+from nltk.compat import python_2_unicode_compatible, unicode_repr
 from nltk.internals import raise_unorderable_types
 
 from nltk.probability import ImmutableProbabilisticMixIn
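Two compatibility helpers enter here: string_types from six and unicode_repr from nltk.compat. A minimal sketch of how the rest of this module uses them, assuming the usual six semantics (string_types is (basestring,) on Python 2 and (str,) on Python 3; unicode_repr is essentially repr without the Python 2 u'' prefix):

    from six import string_types
    from nltk.compat import unicode_repr

    def symbol_text(symbol):
        # Mirrors the Nonterminal.__str__/__repr__ pattern below.
        if isinstance(symbol, string_types):
            return '%s' % symbol
        return '%s' % unicode_repr(symbol)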
@@ -84,6 +89,7 @@ from nltk.featstruct import FeatStruct, FeatDict, FeatStructReader, SLASH, TYPE
 
 
 @total_ordering
+@python_2_unicode_compatible
 class Nonterminal(object):
     """
     A non-terminal symbol for a context free grammar.  ``Nonterminal``
@@ -150,10 +156,10 @@ class Nonterminal(object):
 
         :rtype: str
         """
-        if isinstance(self._symbol, str):
-            return "%s" % self._symbol
+        if isinstance(self._symbol, string_types):
+            return '%s' % self._symbol
         else:
-            return "%s" % repr(self._symbol)
+            return '%s' % unicode_repr(self._symbol)
 
     def __str__(self):
         """
@@ -161,10 +167,10 @@ class Nonterminal(object):
 
         :rtype: str
         """
-        if isinstance(self._symbol, str):
-            return "%s" % self._symbol
+        if isinstance(self._symbol, string_types):
+            return '%s' % self._symbol
         else:
-            return "%s" % repr(self._symbol)
+            return '%s' % unicode_repr(self._symbol)
 
     def __div__(self, rhs):
         """
@@ -176,7 +182,7 @@ class Nonterminal(object):
         :type rhs: Nonterminal
         :rtype: Nonterminal
         """
-        return Nonterminal("%s/%s" % (self._symbol, rhs._symbol))
+        return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
 
     def __truediv__(self, rhs):
         """
@@ -206,8 +212,8 @@ def nonterminals(symbols):
         in the same order as the symbols names.
     :rtype: list(Nonterminal)
     """
-    if "," in symbols:
-        symbol_list = symbols.split(",")
+    if ',' in symbols:
+        symbol_list = symbols.split(',')
     else:
         symbol_list = symbols.split()
     return [Nonterminal(s.strip()) for s in symbol_list]
@@ -245,7 +251,7 @@ def is_terminal(item):
 
     :rtype: bool
     """
-    return hasattr(item, "__hash__") and not isinstance(item, Nonterminal)
+    return hasattr(item, '__hash__') and not isinstance(item, Nonterminal)
 
 
 #################################################################
@@ -254,7 +260,7 @@ def is_terminal(item):
 
 
 @total_ordering
-
+@python_2_unicode_compatible
 class Production(object):
     """
     A grammar production.  Each production maps a single symbol
@@ -284,9 +290,9 @@ class Production(object):
         :param rhs: The right-hand side of the new ``Production``.
         :type rhs: sequence(Nonterminal and terminal)
         """
-        if isinstance(rhs, str):
+        if isinstance(rhs, string_types):
             raise TypeError(
-                "production right hand side should be a list, " "not a string"
+                'production right hand side should be a list, ' 'not a string'
             )
         self._lhs = lhs
         self._rhs = tuple(rhs)
@@ -338,8 +344,8 @@ class Production(object):
 
         :rtype: str
         """
-        result = "%s -> " % repr(self._lhs)
-        result += " ".join(repr(el) for el in self._rhs)
+        result = '%s -> ' % unicode_repr(self._lhs)
+        result += " ".join(unicode_repr(el) for el in self._rhs)
         return result
 
     def __repr__(self):
@@ -348,7 +354,7 @@ class Production(object):
 
         :rtype: str
         """
-        return "%s" % self
+        return '%s' % self
 
     def __eq__(self, other):
         """
@@ -379,7 +385,7 @@ class Production(object):
         return self._hash
 
 
-
+@python_2_unicode_compatible
 class DependencyProduction(Production):
     """
     A dependency grammar production.  Each production maps a single
@@ -392,13 +398,13 @@ class DependencyProduction(Production):
 
         :rtype: str
         """
-        result = "'%s' ->" % (self._lhs,)
+        result = '\'%s\' ->' % (self._lhs,)
         for elt in self._rhs:
-            result += " '%s'" % (elt,)
+            result += ' \'%s\'' % (elt,)
         return result
 
 
-
+@python_2_unicode_compatible
 class ProbabilisticProduction(Production, ImmutableProbabilisticMixIn):
     """
     A probabilistic context free grammar production.
@@ -425,8 +431,8 @@ class ProbabilisticProduction(Production, ImmutableProbabilisticMixIn):
         Production.__init__(self, lhs, rhs)
 
     def __str__(self):
-        return super().__str__() + (
-            " [1.0]" if (self.prob() == 1.0) else " [%g]" % self.prob()
+        return Production.__unicode__(self) + (
+            ' [1.0]' if (self.prob() == 1.0) else ' [%g]' % self.prob()
         )
 
     def __eq__(self, other):
@@ -449,7 +455,7 @@ class ProbabilisticProduction(Production, ImmutableProbabilisticMixIn):
 #################################################################
 
 
-
+@python_2_unicode_compatible
 class CFG(object):
     """
     A context-free grammar.  A grammar consists of a start state and
@@ -673,7 +679,7 @@ class CFG(object):
         """
         missing = [tok for tok in tokens if not self._lexical_index.get(tok)]
         if missing:
-            missing = ", ".join("%r" % (w,) for w in missing)
+            missing = ', '.join('%r' % (w,) for w in missing)
             raise ValueError(
                 "Grammar does not cover some of the " "input words: %r." % missing
             )
@@ -747,120 +753,14 @@ class CFG(object):
         """
         return self.is_flexible_chomsky_normal_form() and self._all_unary_are_lexical
 
-    def chomsky_normal_form(self, new_token_padding="@$@", flexible=False):
-        """
-        Returns a new Grammer that is in chomsky normal
-        :param: new_token_padding
-            Customise new rule formation during binarisation
-        """
-        if self.is_chomsky_normal_form():
-            return self
-        if self.productions(empty=True):
-            raise ValueError(
-                ("Grammar has Empty rules. " "Cannot deal with them at the moment")
-            )
-
-        # check for mixed rules
-        for rule in self.productions():
-            if rule.is_lexical() and len(rule.rhs()) > 1:
-                raise ValueError(
-                    "Cannot handled mixed rule {} => {}".format(rule.lhs(), rule.rhs())
-                )
-
-        step1 = CFG.eliminate_start(self)
-        step2 = CFG.binarize(step1, new_token_padding)
-        if flexible:
-            return step2
-        step3 = CFG.remove_unitary_rules(step2)
-        return step3
-
-    @classmethod
-    def remove_unitary_rules(cls, grammar):
-        """
-        Remove nonlexical unitary rules and convert them to
-        lexical
-        """
-        result = []
-        unitary = []
-        for rule in grammar.productions():
-            if len(rule) == 1 and rule.is_nonlexical():
-                unitary.append(rule)
-            else:
-                result.append(rule)
-
-        while unitary:
-            rule = unitary.pop(0)
-            for item in grammar.productions(lhs=rule.rhs()[0]):
-                new_rule = Production(rule.lhs(), item.rhs())
-                if len(new_rule) != 1 or new_rule.is_lexical():
-                    result.append(new_rule)
-                else:
-                    unitary.append(new_rule)
-
-        n_grammar = CFG(grammar.start(), result)
-        return n_grammar
-
-    @classmethod
-    def binarize(cls, grammar, padding="@$@"):
-        """
-        Convert all non-binary rules into binary by introducing
-        new tokens.
-        Example::
-        Original:
-            A => B C D
-        After Conversion:
-            A => B A@$@B
-            A@$@B => C D
-        """
-        result = []
-
-        for rule in grammar.productions():
-            if len(rule.rhs()) > 2:
-                # this rule needs to be broken down
-                left_side = rule.lhs()
-                for k in range(0, len(rule.rhs()) - 2):
-                    tsym = rule.rhs()[k]
-                    new_sym = Nonterminal(left_side.symbol() + padding + tsym.symbol())
-                    new_production = Production(left_side, (tsym, new_sym))
-                    left_side = new_sym
-                    result.append(new_production)
-                last_prd = Production(left_side, rule.rhs()[-2:])
-                result.append(last_prd)
-            else:
-                result.append(rule)
-
-        n_grammar = CFG(grammar.start(), result)
-        return n_grammar
-
-    @classmethod
-    def eliminate_start(cls, grammar):
-        """
-        Eliminate start rule in case it appears on RHS
-        Example: S -> S0 S1 and S0 -> S1 S
-        Then another rule S0_Sigma -> S is added
-        """
-        start = grammar.start()
-        result = []
-        need_to_add = None
-        for rule in grammar.productions():
-            if start in rule.rhs():
-                need_to_add = True
-            result.append(rule)
-        if need_to_add:
-            start = Nonterminal("S0_SIGMA")
-            result.append(Production(start, [grammar.start()]))
-            n_grammar = CFG(start, result)
-            return n_grammar
-        return grammar
-
     def __repr__(self):
-        return "<Grammar with %d productions>" % len(self._productions)
+        return '<Grammar with %d productions>' % len(self._productions)
 
     def __str__(self):
-        result = "Grammar with %d productions" % len(self._productions)
-        result += " (start state = %r)" % self._start
+        result = 'Grammar with %d productions' % len(self._productions)
+        result += ' (start state = %r)' % self._start
         for production in self._productions:
-            result += "\n    %s" % production
+            result += '\n    %s' % production
         return result
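Note that this hunk also drops the chomsky_normal_form, remove_unitary_rules, binarize and eliminate_start helpers, which exist only in later NLTK releases; on the 3.4 snapshot, callers that need a binarised grammar have to build it by hand. A hedged stand-in following the removed binarize docstring (A -> B C D becomes A -> B A@$@B and A@$@B -> C D; all names are illustrative):

    from nltk import CFG, Nonterminal, Production

    A, B, C, D = (Nonterminal(s) for s in 'ABCD')
    helper = Nonterminal('A@$@B')                 # padding token from the removed docstring
    binary_rules = [Production(A, (B, helper)), Production(helper, (C, D))]
    print(CFG(A, binary_rules))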
 
 
@@ -944,7 +844,7 @@ class FeatureGrammar(CFG):
             )
         elif logic_parser is not None:
             raise Exception(
-                "'logic_parser' and 'fstruct_reader' must " "not both be set"
+                '\'logic_parser\' and \'fstruct_reader\' must ' 'not both be set'
             )
 
         start, productions = read_grammar(
@@ -1020,7 +920,7 @@ class FeatureGrammar(CFG):
 
 
 @total_ordering
-
+@python_2_unicode_compatible
 class FeatureValueType(object):
     """
     A helper class for ``FeatureGrammars``, designed to be different
@@ -1033,7 +933,7 @@ class FeatureValueType(object):
         self._hash = hash(value)
 
     def __repr__(self):
-        return "<%s>" % self._value
+        return '<%s>' % self._value
 
     def __eq__(self, other):
         return type(self) == type(other) and self._value == other._value
@@ -1050,7 +950,7 @@ class FeatureValueType(object):
         return self._hash
 
 
-
+@python_2_unicode_compatible
 class DependencyGrammar(object):
     """
     A dependency grammar.  A DependencyGrammar consists of a set of
@@ -1070,16 +970,16 @@ class DependencyGrammar(object):
     @classmethod
     def fromstring(cls, input):
         productions = []
-        for linenum, line in enumerate(input.split("\n")):
+        for linenum, line in enumerate(input.split('\n')):
             line = line.strip()
-            if line.startswith("#") or line == "":
+            if line.startswith('#') or line == '':
                 continue
             try:
                 productions += _read_dependency_production(line)
             except ValueError:
-                raise ValueError("Unable to parse line %s: %s" % (linenum, line))
+                raise ValueError('Unable to parse line %s: %s' % (linenum, line))
         if len(productions) == 0:
-            raise ValueError("No productions found!")
+            raise ValueError('No productions found!')
         return cls(productions)
 
     def contains(self, head, mod):
@@ -1133,19 +1033,19 @@ class DependencyGrammar(object):
 
         :rtype: str
         """
-        str = "Dependency grammar with %d productions" % len(self._productions)
+        str = 'Dependency grammar with %d productions' % len(self._productions)
         for production in self._productions:
-            str += "\n  %s" % production
+            str += '\n  %s' % production
         return str
 
     def __repr__(self):
         """
         Return a concise string representation of the ``DependencyGrammar``
         """
-        return "Dependency grammar with %d productions" % len(self._productions)
-
+        return 'Dependency grammar with %d productions' % len(self._productions)
 
 
+@python_2_unicode_compatible
 class ProbabilisticDependencyGrammar(object):
     """
 
@@ -1179,24 +1079,24 @@ class ProbabilisticDependencyGrammar(object):
 
         :rtype: str
         """
-        str = "Statistical dependency grammar with %d productions" % len(
+        str = 'Statistical dependency grammar with %d productions' % len(
             self._productions
         )
         for production in self._productions:
-            str += "\n  %s" % production
-        str += "\nEvents:"
+            str += '\n  %s' % production
+        str += '\nEvents:'
         for event in self._events:
-            str += "\n  %d:%s" % (self._events[event], event)
-        str += "\nTags:"
+            str += '\n  %d:%s' % (self._events[event], event)
+        str += '\nTags:'
         for tag_word in self._tags:
-            str += "\n %s:\t(%s)" % (tag_word, self._tags[tag_word])
+            str += '\n %s:\t(%s)' % (tag_word, self._tags[tag_word])
         return str
 
     def __repr__(self):
         """
         Return a concise string representation of the ``ProbabilisticDependencyGrammar``
         """
-        return "Statistical Dependency grammar with %d productions" % len(
+        return 'Statistical Dependency grammar with %d productions' % len(
             self._productions
         )
 
@@ -1331,10 +1231,10 @@ def _read_fcfg_production(input, fstruct_reader):
 
 # Parsing generic grammars
 
-_ARROW_RE = re.compile(r"\s* -> \s*", re.VERBOSE)
-_PROBABILITY_RE = re.compile(r"( \[ [\d\.]+ \] ) \s*", re.VERBOSE)
+_ARROW_RE = re.compile(r'\s* -> \s*', re.VERBOSE)
+_PROBABILITY_RE = re.compile(r'( \[ [\d\.]+ \] ) \s*', re.VERBOSE)
 _TERMINAL_RE = re.compile(r'( "[^"]+" | \'[^\']+\' ) \s*', re.VERBOSE)
-_DISJUNCTION_RE = re.compile(r"\| \s*", re.VERBOSE)
+_DISJUNCTION_RE = re.compile(r'\| \s*', re.VERBOSE)
 
 
 def _read_production(line, nonterm_parser, probabilistic=False):
@@ -1350,7 +1250,7 @@ def _read_production(line, nonterm_parser, probabilistic=False):
     # Skip over the arrow.
     m = _ARROW_RE.match(line, pos)
     if not m:
-        raise ValueError("Expected an arrow")
+        raise ValueError('Expected an arrow')
     pos = m.end()
 
     # Parse the right hand side.
@@ -1364,20 +1264,20 @@ def _read_production(line, nonterm_parser, probabilistic=False):
             probabilities[-1] = float(m.group(1)[1:-1])
             if probabilities[-1] > 1.0:
                 raise ValueError(
-                    "Production probability %f, "
-                    "should not be greater than 1.0" % (probabilities[-1],)
+                    'Production probability %f, '
+                    'should not be greater than 1.0' % (probabilities[-1],)
                 )
 
         # String -- add terminal.
-        elif line[pos] in "'\"":
+        elif line[pos] in "\'\"":
             m = _TERMINAL_RE.match(line, pos)
             if not m:
-                raise ValueError("Unterminated string")
+                raise ValueError('Unterminated string')
             rhsides[-1].append(m.group(1)[1:-1])
             pos = m.end()
 
         # Vertical bar -- start new rhside.
-        elif line[pos] == "|":
+        elif line[pos] == '|':
             m = _DISJUNCTION_RE.match(line, pos)
             probabilities.append(0.0)
             rhsides.append([])
@@ -1419,51 +1319,51 @@ def read_grammar(input, nonterm_parser, probabilistic=False, encoding=None):
     """
     if encoding is not None:
         input = input.decode(encoding)
-    if isinstance(input, str):
-        lines = input.split("\n")
+    if isinstance(input, string_types):
+        lines = input.split('\n')
     else:
         lines = input
 
     start = None
     productions = []
-    continue_line = ""
+    continue_line = ''
     for linenum, line in enumerate(lines):
         line = continue_line + line.strip()
-        if line.startswith("#") or line == "":
+        if line.startswith('#') or line == '':
             continue
-        if line.endswith("\\"):
-            continue_line = line[:-1].rstrip() + " "
+        if line.endswith('\\'):
+            continue_line = line[:-1].rstrip() + ' '
             continue
-        continue_line = ""
+        continue_line = ''
         try:
-            if line[0] == "%":
+            if line[0] == '%':
                 directive, args = line[1:].split(None, 1)
-                if directive == "start":
+                if directive == 'start':
                     start, pos = nonterm_parser(args, 0)
                     if pos != len(args):
-                        raise ValueError("Bad argument to start directive")
+                        raise ValueError('Bad argument to start directive')
                 else:
-                    raise ValueError("Bad directive")
+                    raise ValueError('Bad directive')
             else:
                 # expand out the disjunctions on the RHS
                 productions += _read_production(line, nonterm_parser, probabilistic)
         except ValueError as e:
-            raise ValueError("Unable to parse line %s: %s\n%s" % (linenum + 1, line, e))
+            raise ValueError('Unable to parse line %s: %s\n%s' % (linenum + 1, line, e))
 
     if not productions:
-        raise ValueError("No productions found!")
+        raise ValueError('No productions found!')
     if not start:
         start = productions[0].lhs()
     return (start, productions)
 
 
-_STANDARD_NONTERM_RE = re.compile("( [\w/][\w/^<>-]* ) \s*", re.VERBOSE)
+_STANDARD_NONTERM_RE = re.compile('( [\w/][\w/^<>-]* ) \s*', re.VERBOSE)
 
 
 def standard_nonterm_parser(string, pos):
     m = _STANDARD_NONTERM_RE.match(string, pos)
     if not m:
-        raise ValueError("Expected a nonterminal, found: " + string[pos:])
+        raise ValueError('Expected a nonterminal, found: ' + string[pos:])
     return (Nonterminal(m.group(1)), m.end())
 
 
@@ -1472,7 +1372,7 @@ def standard_nonterm_parser(string, pos):
 #################################################################
 
 _READ_DG_RE = re.compile(
-    r"""^\s*                # leading whitespace
+    r'''^\s*                # leading whitespace
                               ('[^']+')\s*        # single-quoted lhs
                               (?:[-=]+>)\s*        # arrow
                               (?:(                 # rhs:
@@ -1481,24 +1381,24 @@ _READ_DG_RE = re.compile(
                                  | \|              # disjunction
                                  )
                                  \s*)              # trailing space
-                                 *$""",  # zero or more copies
+                                 *$''',  # zero or more copies
     re.VERBOSE,
 )
-_SPLIT_DG_RE = re.compile(r"""('[^']'|[-=]+>|"[^"]+"|'[^']+'|\|)""")
+_SPLIT_DG_RE = re.compile(r'''('[^']'|[-=]+>|"[^"]+"|'[^']+'|\|)''')
 
 
 def _read_dependency_production(s):
     if not _READ_DG_RE.match(s):
-        raise ValueError("Bad production string")
+        raise ValueError('Bad production string')
     pieces = _SPLIT_DG_RE.split(s)
     pieces = [p for i, p in enumerate(pieces) if i % 2 == 1]
-    lhside = pieces[0].strip("'\"")
+    lhside = pieces[0].strip('\'\"')
     rhsides = [[]]
     for piece in pieces[2:]:
-        if piece == "|":
+        if piece == '|':
             rhsides.append([])
         else:
-            rhsides[-1].append(piece.strip("'\""))
+            rhsides[-1].append(piece.strip('\'\"'))
     return [DependencyProduction(lhside, rhside) for rhside in rhsides]
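These regexes define the plain-text format that DependencyGrammar.fromstring accepts: a quoted head, an arrow, and quoted modifiers separated by '|'. A small usage sketch (the vocabulary echoes NLTK's own dependency-grammar demo):

    from nltk.grammar import DependencyGrammar

    dg = DependencyGrammar.fromstring("""
        'taught' -> 'play' | 'man'
        'man'    -> 'the'
        'play'   -> 'golf' | 'dachshund' | 'to'
    """)
    print(dg.contains('taught', 'man'))   # True: 'man' can modify 'taught'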
 
 
@@ -1515,12 +1415,12 @@ def cfg_demo():
     from nltk import nonterminals, Production, CFG
 
     # Create some nonterminals
-    S, NP, VP, PP = nonterminals("S, NP, VP, PP")
-    N, V, P, Det = nonterminals("N, V, P, Det")
+    S, NP, VP, PP = nonterminals('S, NP, VP, PP')
+    N, V, P, Det = nonterminals('N, V, P, Det')
     VP_slash_NP = VP / NP
 
-    print("Some nonterminals:", [S, NP, VP, PP, N, V, P, Det, VP / NP])
-    print("    S.symbol() =>", repr(S.symbol()))
+    print('Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP / NP])
+    print('    S.symbol() =>', repr(S.symbol()))
     print()
 
     print(Production(S, [NP]))
@@ -1539,11 +1439,11 @@ def cfg_demo():
     """
     )
 
-    print("A Grammar:", repr(grammar))
-    print("    grammar.start()       =>", repr(grammar.start()))
-    print("    grammar.productions() =>", end=" ")
+    print('A Grammar:', repr(grammar))
+    print('    grammar.start()       =>', repr(grammar.start()))
+    print('    grammar.productions() =>', end=' ')
     # Use string.replace(...) is to line-wrap the output.
-    print(repr(grammar.productions()).replace(",", ",\n" + " " * 25))
+    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 25))
     print()
 
 
@@ -1602,18 +1502,18 @@ def pcfg_demo():
     pcfg_prods = toy_pcfg1.productions()
 
     pcfg_prod = pcfg_prods[2]
-    print("A PCFG production:", repr(pcfg_prod))
-    print("    pcfg_prod.lhs()  =>", repr(pcfg_prod.lhs()))
-    print("    pcfg_prod.rhs()  =>", repr(pcfg_prod.rhs()))
-    print("    pcfg_prod.prob() =>", repr(pcfg_prod.prob()))
+    print('A PCFG production:', repr(pcfg_prod))
+    print('    pcfg_prod.lhs()  =>', repr(pcfg_prod.lhs()))
+    print('    pcfg_prod.rhs()  =>', repr(pcfg_prod.rhs()))
+    print('    pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
     print()
 
     grammar = toy_pcfg2
-    print("A PCFG grammar:", repr(grammar))
-    print("    grammar.start()       =>", repr(grammar.start()))
-    print("    grammar.productions() =>", end=" ")
+    print('A PCFG grammar:', repr(grammar))
+    print('    grammar.start()       =>', repr(grammar.start()))
+    print('    grammar.productions() =>', end=' ')
     # Use .replace(...) is to line-wrap the output.
-    print(repr(grammar.productions()).replace(",", ",\n" + " " * 26))
+    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 26))
     print()
 
     # extract productions from three trees and induce the PCFG
@@ -1628,7 +1528,7 @@ def pcfg_demo():
 
         productions += tree.productions()
 
-    S = Nonterminal("S")
+    S = Nonterminal('S')
     grammar = induce_pcfg(S, productions)
     print(grammar)
     print()
@@ -1650,7 +1550,7 @@ def pcfg_demo():
 def fcfg_demo():
     import nltk.data
 
-    g = nltk.data.load("grammars/book_grammars/feat0.fcfg")
+    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
     print(g)
     print()
 
@@ -1706,19 +1606,19 @@ def demo():
     sdg_demo()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
 
 __all__ = [
-    "Nonterminal",
-    "nonterminals",
-    "CFG",
-    "Production",
-    "PCFG",
-    "ProbabilisticProduction",
-    "DependencyGrammar",
-    "DependencyProduction",
-    "ProbabilisticDependencyGrammar",
-    "induce_pcfg",
-    "read_grammar",
+    'Nonterminal',
+    'nonterminals',
+    'CFG',
+    'Production',
+    'PCFG',
+    'ProbabilisticProduction',
+    'DependencyGrammar',
+    'DependencyProduction',
+    'ProbabilisticDependencyGrammar',
+    'induce_pcfg',
+    'read_grammar',
 ]
index 8b292d6..27671e8 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit (NLTK) Help
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -8,6 +8,7 @@
 """
 Provide structured access to documentation.
 """
+from __future__ import print_function
 
 import re
 from textwrap import wrap
@@ -37,7 +38,7 @@ def _print_entries(tags, tagdict):
         entry = tagdict[tag]
         defn = [tag + ": " + entry[0]]
         examples = wrap(
-            entry[1], width=75, initial_indent="    ", subsequent_indent="    "
+            entry[1], width=75, initial_indent='    ', subsequent_indent='    '
         )
         print("\n".join(defn + examples))
 
@@ -57,8 +58,8 @@ def _format_tagset(tagset, tagpattern=None):
             print("No matching tags found.")
 
 
-if __name__ == "__main__":
-    brown_tagset(r"NN.*")
-    upenn_tagset(r".*\$")
-    claws5_tagset("UNDEFINED")
-    brown_tagset(r"NN")
+if __name__ == '__main__':
+    brown_tagset(r'NN.*')
+    upenn_tagset(r'.*\$')
+    claws5_tagset('UNDEFINED')
+    brown_tagset(r'NN')
index fdd653d..d79c935 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Inference
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Dan Garrette <dhgarrette@gmail.com>
 #         Ewan Klein <ewan@inf.ed.ac.uk>
 #
index abbb072..1255a72 100644 (file)
Binary files a/nlp_resource_data/nltk/inference/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/inference/__pycache__/__init__.cpython-37.pyc differ
index 478b83d..06c3cc7 100644 (file)
Binary files a/nlp_resource_data/nltk/inference/__pycache__/api.cpython-37.pyc and b/nlp_resource_data/nltk/inference/__pycache__/api.cpython-37.pyc differ
index db0b098..1f8f18f 100644 (file)
Binary files a/nlp_resource_data/nltk/inference/__pycache__/discourse.cpython-37.pyc and b/nlp_resource_data/nltk/inference/__pycache__/discourse.cpython-37.pyc differ
index 3dfabb3..7c3bb47 100644 (file)
Binary files a/nlp_resource_data/nltk/inference/__pycache__/mace.cpython-37.pyc and b/nlp_resource_data/nltk/inference/__pycache__/mace.cpython-37.pyc differ
index 24af18a..e6f7652 100644 (file)
Binary files a/nlp_resource_data/nltk/inference/__pycache__/nonmonotonic.cpython-37.pyc and b/nlp_resource_data/nltk/inference/__pycache__/nonmonotonic.cpython-37.pyc differ
index 1c27213..ffec0b9 100644 (file)
Binary files a/nlp_resource_data/nltk/inference/__pycache__/prover9.cpython-37.pyc and b/nlp_resource_data/nltk/inference/__pycache__/prover9.cpython-37.pyc differ
index 21f4fd2..016d28d 100644 (file)
Binary files a/nlp_resource_data/nltk/inference/__pycache__/resolution.cpython-37.pyc and b/nlp_resource_data/nltk/inference/__pycache__/resolution.cpython-37.pyc differ
index 525035f..8ce007c 100644 (file)
Binary files a/nlp_resource_data/nltk/inference/__pycache__/tableau.cpython-37.pyc and b/nlp_resource_data/nltk/inference/__pycache__/tableau.cpython-37.pyc differ
index 3135e1b..3bc8ad3 100644 (file)
@@ -17,13 +17,17 @@ the model builder tries to build a model for the assumptions. Given a set of ass
 goal *G*, the model builder tries to find a counter-model, in the sense of a model that will satisfy
 the assumptions plus the negation of *G*.
 """
+from __future__ import print_function
 
 from abc import ABCMeta, abstractmethod
 import threading
 import time
 
+from six import add_metaclass
 
-class Prover(metaclass=ABCMeta):
+
+@add_metaclass(ABCMeta)
+class Prover(object):
     """
     Interface for trying to prove a goal from assumptions.  Both the goal and
     the assumptions are constrained to be formulas of ``logic.Expression``.
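The class statements in this file are rewritten from the Python-3-only spelling class Prover(metaclass=ABCMeta) to six.add_metaclass, which works on both interpreters. A minimal sketch of the idiom with a hypothetical class (not from the diff):

    from abc import ABCMeta, abstractmethod
    from six import add_metaclass

    @add_metaclass(ABCMeta)            # equivalent to "metaclass=ABCMeta" on Python 3
    class Greeter(object):
        @abstractmethod
        def greet(self):
            """Concrete subclasses must implement this."""

Instantiating Greeter() raises TypeError until a subclass implements greet(), which is exactly the contract Prover, ModelBuilder and TheoremToolCommand rely on here.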
@@ -44,7 +48,8 @@ class Prover(metaclass=ABCMeta):
         """
 
 
-class ModelBuilder(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class ModelBuilder(object):
     """
     Interface for trying to build a model of set of formulas.
     Open formulas are assumed to be universally quantified.
@@ -69,7 +74,8 @@ class ModelBuilder(metaclass=ABCMeta):
         """
 
 
-class TheoremToolCommand(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class TheoremToolCommand(object):
     """
     This class holds a goal and a list of assumptions to be used in proving
     or model building.
@@ -350,7 +356,7 @@ class BaseModelBuilderCommand(BaseTheoremToolCommand, ModelBuilderCommand):
         :return: str
         """
         if self._result is None:
-            raise LookupError("You have to call build_model() first to " "get a model!")
+            raise LookupError('You have to call build_model() first to ' 'get a model!')
         else:
             return self._decorate_model(self._model, format)
 
@@ -484,7 +490,7 @@ class ModelBuilderCommandDecorator(TheoremToolCommandDecorator, ModelBuilderComm
         :return: str
         """
         if self._result is None:
-            raise LookupError("You have to call build_model() first to " "get a model!")
+            raise LookupError('You have to call build_model() first to ' 'get a model!')
         else:
             return self._decorate_model(self._model, format)
 
@@ -514,20 +520,20 @@ class ParallelProverBuilder(Prover, ModelBuilder):
         self._modelbuilder = modelbuilder
 
     def _prove(self, goal=None, assumptions=None, verbose=False):
-        return self._run(goal, assumptions, verbose), ""
+        return self._run(goal, assumptions, verbose), ''
 
     def _build_model(self, goal=None, assumptions=None, verbose=False):
-        return not self._run(goal, assumptions, verbose), ""
+        return not self._run(goal, assumptions, verbose), ''
 
     def _run(self, goal, assumptions, verbose):
         # Set up two thread, Prover and ModelBuilder to run in parallel
         tp_thread = TheoremToolThread(
-            lambda: self._prover.prove(goal, assumptions, verbose), verbose, "TP"
+            lambda: self._prover.prove(goal, assumptions, verbose), verbose, 'TP'
         )
         mb_thread = TheoremToolThread(
             lambda: self._modelbuilder.build_model(goal, assumptions, verbose),
             verbose,
-            "MB",
+            'MB',
         )
 
         tp_thread.start()
@@ -569,10 +575,10 @@ class ParallelProverBuilderCommand(BaseProverCommand, BaseModelBuilderCommand):
     def _run(self, verbose):
         # Set up two threads, Prover and ModelBuilder, to run in parallel
         tp_thread = TheoremToolThread(
-            lambda: BaseProverCommand.prove(self, verbose), verbose, "TP"
+            lambda: BaseProverCommand.prove(self, verbose), verbose, 'TP'
         )
         mb_thread = TheoremToolThread(
-            lambda: BaseModelBuilderCommand.build_model(self, verbose), verbose, "MB"
+            lambda: BaseModelBuilderCommand.build_model(self, verbose), verbose, 'MB'
         )
 
         tp_thread.start()
@@ -602,12 +608,12 @@ class TheoremToolThread(threading.Thread):
             self._result = self._command()
             if self._verbose:
                 print(
-                    "Thread %s finished with result %s at %s"
+                    'Thread %s finished with result %s at %s'
                     % (self._name, self._result, time.localtime(time.time()))
                 )
         except Exception as e:
             print(e)
-            print("Thread %s completed abnormally" % (self._name))
+            print('Thread %s completed abnormally' % (self._name))
 
     @property
     def result(self):
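
The api.py hunks above swap Python 3's `metaclass=ABCMeta` keyword for six's `add_metaclass` decorator so the abstract interfaces also import under Python 2. A minimal standalone sketch of the equivalence (the class name here is illustrative, not from the patch):

    # Both spellings produce the same ABCMeta-backed abstract class; only the
    # decorator form is valid syntax on Python 2.
    from abc import ABCMeta, abstractmethod
    from six import add_metaclass

    @add_metaclass(ABCMeta)
    class ProverLike(object):
        @abstractmethod
        def prove(self, goal=None, assumptions=None, verbose=False):
            """Return (proved, proof_string)."""

    # ProverLike() raises TypeError until a subclass implements prove(),
    # exactly as with `class ProverLike(metaclass=ABCMeta)` on Python 3.
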
index 5d4065c..7dad02d 100644 (file)
@@ -42,12 +42,14 @@ The set of all threads for a discourse is the Cartesian product of all the readi
 (This is not intended to scale beyond very short discourses!) The method ``readings(filter=True)`` will only show
 those threads which are consistent (taking into account any background assumptions).
 """
+from __future__ import print_function
 
 import os
 from abc import ABCMeta, abstractmethod
 from operator import and_, add
 from functools import reduce
 
+from six import add_metaclass
 
 from nltk.data import show_cfg
 from nltk.tag import RegexpTagger
@@ -61,7 +63,8 @@ from nltk.inference.mace import MaceCommand
 from nltk.inference.prover9 import Prover9Command
 
 
-class ReadingCommand(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class ReadingCommand(object):
     @abstractmethod
     def parse_to_readings(self, sentence):
         """
@@ -109,7 +112,7 @@ class CfgReadingCommand(ReadingCommand):
         :type gramfile: str
         """
         self._gramfile = (
-            gramfile if gramfile else "grammars/book_grammars/discourse.fcfg"
+            gramfile if gramfile else 'grammars/book_grammars/discourse.fcfg'
         )
         self._parser = load_parser(self._gramfile)
 
@@ -139,7 +142,7 @@ class DrtGlueReadingCommand(ReadingCommand):
         """
         if semtype_file is None:
             semtype_file = os.path.join(
-                "grammars", "sample_grammars", "drt_glue.semtype"
+                'grammars', 'sample_grammars', 'drt_glue.semtype'
             )
         self._glue = DrtGlue(
             semtype_file=semtype_file,
@@ -183,7 +186,7 @@ class DiscourseTester(object):
         :type background: list(Expression)
         """
         self._input = input
-        self._sentences = dict([("s%s" % i, sent) for i, sent in enumerate(input)])
+        self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(input)])
         self._models = None
         self._readings = {}
         self._reading_command = (
@@ -239,7 +242,7 @@ class DiscourseTester(object):
 
         self._input.append(sentence)
         self._sentences = dict(
-            [("s%s" % i, sent) for i, sent in enumerate(self._input)]
+            [('s%s' % i, sent) for i, sent in enumerate(self._input)]
         )
         # check whether adding the new sentence to the discourse preserves consistency (i.e. a model can be found for the combined set
         # of assumptions)
@@ -266,7 +269,7 @@ class DiscourseTester(object):
             self.sentences()
             return None
         self._sentences = dict(
-            [("s%s" % i, sent) for i, sent in enumerate(self._input)]
+            [('s%s' % i, sent) for i, sent in enumerate(self._input)]
         )
         self.readings(verbose=False)
         if verbose:
@@ -337,7 +340,7 @@ class DiscourseTester(object):
         else:
             for sid in sorted(self._readings):
                 print()
-                print("%s readings:" % sid)
+                print('%s readings:' % sid)
                 print()  #'-' * 30
                 for rid in sorted(self._readings[sid]):
                     lf = self._readings[sid][rid]
@@ -351,7 +354,7 @@ class DiscourseTester(object):
         for tid in sorted(threads):
             if show_thread_readings:
                 readings = [
-                    self._readings[rid.split("-")[0]][rid] for rid in self._threads[tid]
+                    self._readings[rid.split('-')[0]][rid] for rid in self._threads[tid]
                 ]
                 try:
                     thread_reading = (
@@ -359,9 +362,9 @@ class DiscourseTester(object):
                         % self._reading_command.combine_readings(readings).normalize()
                     )
                 except Exception as e:
-                    thread_reading = ": INVALID: %s" % e.__class__.__name__
+                    thread_reading = ': INVALID: %s' % e.__class__.__name__
             else:
-                thread_reading = ""
+                thread_reading = ''
 
             print("%s:" % tid, self._threads[tid], thread_reading)
 
@@ -412,7 +415,7 @@ class DiscourseTester(object):
         return [
             (rid, self._readings[sid][rid])
             for rid in threads[thread_id]
-            for sid in rid.split("-")[:1]
+            for sid in rid.split('-')[:1]
         ]
 
     ###############################
@@ -448,7 +451,7 @@ class DiscourseTester(object):
                         print(a)
                     spacer(80)
                 if modelfound:
-                    print(mb.model(format="cooked"))
+                    print(mb.model(format='cooked'))
                 else:
                     print("No model found!\n")
         return results
@@ -536,6 +539,12 @@ class DiscourseTester(object):
         return result
 
 
+# multiply = DiscourseTester.multiply
+# L1 = [['A'], ['B']]
+# L2 = ['a', 'b', 'c']
+# print multiply(L1,L2)
+
+
 def load_fol(s):
     """
     Temporarily duplicated from ``nltk.sem.util``.
@@ -549,12 +558,12 @@ def load_fol(s):
     statements = []
     for linenum, line in enumerate(s.splitlines()):
         line = line.strip()
-        if line.startswith("#") or line == "":
+        if line.startswith('#') or line == '':
             continue
         try:
             statements.append(Expression.fromstring(line))
         except Exception:
-            raise ValueError("Unable to parse line %s: %s" % (linenum, line))
+            raise ValueError('Unable to parse line %s: %s' % (linenum, line))
     return statements
 
 
@@ -566,7 +575,7 @@ def discourse_demo(reading_command=None):
     Illustrate the various methods of ``DiscourseTester``
     """
     dt = DiscourseTester(
-        ["A boxer walks", "Every boxer chases a girl"], reading_command
+        ['A boxer walks', 'Every boxer chases a girl'], reading_command
     )
     dt.models()
     print()
@@ -578,36 +587,36 @@ def discourse_demo(reading_command=None):
     print()
     dt.readings(threaded=True)
     print()
-    dt.models("d1")
-    dt.add_sentence("John is a boxer")
+    dt.models('d1')
+    dt.add_sentence('John is a boxer')
     print()
     dt.sentences()
     print()
     dt.readings(threaded=True)
     print()
     dt = DiscourseTester(
-        ["A student dances", "Every student is a person"], reading_command
+        ['A student dances', 'Every student is a person'], reading_command
     )
     print()
-    dt.add_sentence("No person dances", consistchk=True)
+    dt.add_sentence('No person dances', consistchk=True)
     print()
     dt.readings()
     print()
-    dt.retract_sentence("No person dances", verbose=True)
+    dt.retract_sentence('No person dances', verbose=True)
     print()
     dt.models()
     print()
-    dt.readings("A person dances")
+    dt.readings('A person dances')
     print()
-    dt.add_sentence("A person dances", informchk=True)
+    dt.add_sentence('A person dances', informchk=True)
     dt = DiscourseTester(
-        ["Vincent is a boxer", "Fido is a boxer", "Vincent is married", "Fido barks"],
+        ['Vincent is a boxer', 'Fido is a boxer', 'Vincent is married', 'Fido barks'],
         reading_command,
     )
     dt.readings(filter=True)
     import nltk.data
 
-    background_file = os.path.join("grammars", "book_grammars", "background.fol")
+    background_file = os.path.join('grammars', 'book_grammars', 'background.fol')
     background = nltk.data.load(background_file)
 
     print()
@@ -623,7 +632,7 @@ def drt_discourse_demo(reading_command=None):
     """
     Illustrate the various methods of ``DiscourseTester``
     """
-    dt = DiscourseTester(["every dog chases a boy", "he runs"], reading_command)
+    dt = DiscourseTester(['every dog chases a boy', 'he runs'], reading_command)
     dt.models()
     print()
     dt.sentences()
@@ -636,7 +645,7 @@ def drt_discourse_demo(reading_command=None):
 
 
 def spacer(num=30):
-    print("-" * num)
+    print('-' * num)
 
 
 def demo():
@@ -644,11 +653,11 @@ def demo():
 
     tagger = RegexpTagger(
         [
-            ("^(chases|runs)$", "VB"),
-            ("^(a)$", "ex_quant"),
-            ("^(every)$", "univ_quant"),
-            ("^(dog|boy)$", "NN"),
-            ("^(he)$", "PRP"),
+            ('^(chases|runs)$', 'VB'),
+            ('^(a)$', 'ex_quant'),
+            ('^(every)$', 'univ_quant'),
+            ('^(dog|boy)$', 'NN'),
+            ('^(he)$', 'PRP'),
         ]
     )
     depparser = MaltParser(tagger=tagger)
@@ -657,5 +666,5 @@ def demo():
     )
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
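
As the discourse.py docstring above notes, a discourse thread is one element of the Cartesian product of the per-sentence readings. A small illustrative sketch of that bookkeeping (the reading ids are made up, not produced by the grammar):

    from itertools import product

    # Hypothetical readings: two for sentence s0, one for sentence s1.
    readings = {'s0': ['s0-r0', 's0-r1'], 's1': ['s1-r0']}

    threads = {
        'd%s' % i: list(combo)
        for i, combo in enumerate(product(*(readings[sid] for sid in sorted(readings))))
    }
    # threads == {'d0': ['s0-r0', 's1-r0'], 'd1': ['s0-r1', 's1-r0']}
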
index 159a510..7763b75 100644 (file)
@@ -9,6 +9,7 @@
 """
 A model builder that makes use of the external 'Mace4' package.
 """
+from __future__ import print_function
 
 import os
 import tempfile
@@ -49,7 +50,7 @@ class MaceCommand(Prover9CommandParent, BaseModelBuilderCommand):
 
     @property
     def valuation(mbc):
-        return mbc.model("valuation")
+        return mbc.model('valuation')
 
     def _convert2val(self, valuation_str):
         """
@@ -58,40 +59,40 @@ class MaceCommand(Prover9CommandParent, BaseModelBuilderCommand):
         :return: A model if one is generated; None otherwise.
         :rtype: sem.Valuation
         """
-        valuation_standard_format = self._transform_output(valuation_str, "standard")
+        valuation_standard_format = self._transform_output(valuation_str, 'standard')
 
         val = []
         for line in valuation_standard_format.splitlines(False):
             l = line.strip()
 
-            if l.startswith("interpretation"):
+            if l.startswith('interpretation'):
                 # find the number of entities in the model
-                num_entities = int(l[l.index("(") + 1 : l.index(",")].strip())
+                num_entities = int(l[l.index('(') + 1 : l.index(',')].strip())
 
-            elif l.startswith("function") and l.find("_") == -1:
+            elif l.startswith('function') and l.find('_') == -1:
                 # replace the integer identifier with a corresponding alphabetic character
-                name = l[l.index("(") + 1 : l.index(",")].strip()
+                name = l[l.index('(') + 1 : l.index(',')].strip()
                 if is_indvar(name):
                     name = name.upper()
-                value = int(l[l.index("[") + 1 : l.index("]")].strip())
+                value = int(l[l.index('[') + 1 : l.index(']')].strip())
                 val.append((name, MaceCommand._make_model_var(value)))
 
-            elif l.startswith("relation"):
-                l = l[l.index("(") + 1 :]
-                if "(" in l:
+            elif l.startswith('relation'):
+                l = l[l.index('(') + 1 :]
+                if '(' in l:
                     # relation is not nullary
-                    name = l[: l.index("(")].strip()
+                    name = l[: l.index('(')].strip()
                     values = [
                         int(v.strip())
-                        for v in l[l.index("[") + 1 : l.index("]")].split(",")
+                        for v in l[l.index('[') + 1 : l.index(']')].split(',')
                     ]
                     val.append(
                         (name, MaceCommand._make_relation_set(num_entities, values))
                     )
                 else:
                     # relation is nullary
-                    name = l[: l.index(",")].strip()
-                    value = int(l[l.index("[") + 1 : l.index("]")].strip())
+                    name = l[: l.index(',')].strip()
+                    value = int(l[l.index('[') + 1 : l.index(']')].strip())
                     val.append((name, value == 1))
 
         return Valuation(val)
@@ -140,32 +141,32 @@ class MaceCommand(Prover9CommandParent, BaseModelBuilderCommand):
         :type value: int
         """
         letter = [
-            "a",
-            "b",
-            "c",
-            "d",
-            "e",
-            "f",
-            "g",
-            "h",
-            "i",
-            "j",
-            "k",
-            "l",
-            "m",
-            "n",
-            "o",
-            "p",
-            "q",
-            "r",
-            "s",
-            "t",
-            "u",
-            "v",
-            "w",
-            "x",
-            "y",
-            "z",
+            'a',
+            'b',
+            'c',
+            'd',
+            'e',
+            'f',
+            'g',
+            'h',
+            'i',
+            'j',
+            'k',
+            'l',
+            'm',
+            'n',
+            'o',
+            'p',
+            'q',
+            'r',
+            's',
+            't',
+            'u',
+            'v',
+            'w',
+            'x',
+            'y',
+            'z',
         ][value]
         num = value // 26
         return letter + str(num) if num > 0 else letter
@@ -182,7 +183,7 @@ class MaceCommand(Prover9CommandParent, BaseModelBuilderCommand):
         """
         if not format:
             return valuation_str
-        elif format == "valuation":
+        elif format == 'valuation':
             return self._convert2val(valuation_str)
         else:
             return self._transform_output(valuation_str, format)
@@ -195,14 +196,14 @@ class MaceCommand(Prover9CommandParent, BaseModelBuilderCommand):
         :type format: str
         """
         if format in [
-            "standard",
-            "standard2",
-            "portable",
-            "tabular",
-            "raw",
-            "cooked",
-            "xml",
-            "tex",
+            'standard',
+            'standard2',
+            'portable',
+            'tabular',
+            'raw',
+            'cooked',
+            'xml',
+            'tex',
         ]:
             return self._call_interpformat(valuation_str, [format])[0]
         else:
@@ -219,7 +220,7 @@ class MaceCommand(Prover9CommandParent, BaseModelBuilderCommand):
         """
         if self._interpformat_bin is None:
             self._interpformat_bin = self._modelbuilder._find_binary(
-                "interpformat", verbose
+                'interpformat', verbose
             )
 
         return self._modelbuilder._call(
@@ -260,18 +261,18 @@ class Mace(Prover9Parent, ModelBuilder):
         :see: ``config_prover9``
         """
         if self._mace4_bin is None:
-            self._mace4_bin = self._find_binary("mace4", verbose)
+            self._mace4_bin = self._find_binary('mace4', verbose)
 
-        updated_input_str = ""
+        updated_input_str = ''
         if self._end_size > 0:
-            updated_input_str += "assign(end_size, %d).\n\n" % self._end_size
+            updated_input_str += 'assign(end_size, %d).\n\n' % self._end_size
         updated_input_str += input_str
 
         return self._call(updated_input_str, self._mace4_bin, args, verbose)
 
 
 def spacer(num=30):
-    print("-" * num)
+    print('-' * num)
 
 
 def decode_result(found):
@@ -281,7 +282,7 @@ def decode_result(found):
     :param found: The output of model_found()
     :type found: bool
     """
-    return {True: "Countermodel found", False: "No countermodel found", None: "None"}[
+    return {True: 'Countermodel found', False: 'No countermodel found', None: 'None'}[
         found
     ]
 
@@ -296,24 +297,24 @@ def test_model_found(arguments):
         m = MaceCommand(g, assumptions=alist, max_models=50)
         found = m.build_model()
         for a in alist:
-            print("   %s" % a)
-        print("|- %s: %s\n" % (g, decode_result(found)))
+            print('   %s' % a)
+        print('|- %s: %s\n' % (g, decode_result(found)))
 
 
 def test_build_model(arguments):
     """
     Try to build a ``nltk.sem.Valuation``.
     """
-    g = Expression.fromstring("all x.man(x)")
+    g = Expression.fromstring('all x.man(x)')
     alist = [
         Expression.fromstring(a)
         for a in [
-            "man(John)",
-            "man(Socrates)",
-            "man(Bill)",
-            "some x.(-(x = John) & man(x) & sees(John,x))",
-            "some x.(-(x = Bill) & man(x))",
-            "all x.some y.(man(x) -> gives(Socrates,x,y))",
+            'man(John)',
+            'man(Socrates)',
+            'man(Bill)',
+            'some x.(-(x = John) & man(x) & sees(John,x))',
+            'some x.(-(x = Bill) & man(x))',
+            'all x.some y.(man(x) -> gives(Socrates,x,y))',
         ]
     ]
 
@@ -323,14 +324,14 @@ def test_build_model(arguments):
     print("Assumptions and Goal")
     spacer()
     for a in alist:
-        print("   %s" % a)
-    print("|- %s: %s\n" % (g, decode_result(m.build_model())))
+        print('   %s' % a)
+    print('|- %s: %s\n' % (g, decode_result(m.build_model())))
     spacer()
-    # print(m.model('standard'))
-    # print(m.model('cooked'))
+    # print m.model('standard')
+    # print m.model('cooked')
     print("Valuation")
     spacer()
-    print(m.valuation, "\n")
+    print(m.valuation, '\n')
 
 
 def test_transform_output(argument_pair):
@@ -342,9 +343,9 @@ def test_transform_output(argument_pair):
     m = MaceCommand(g, assumptions=alist)
     m.build_model()
     for a in alist:
-        print("   %s" % a)
-    print("|- %s: %s\n" % (g, m.build_model()))
-    for format in ["standard", "portable", "xml", "cooked"]:
+        print('   %s' % a)
+    print('|- %s: %s\n' % (g, m.build_model()))
+    for format in ['standard', 'portable', 'xml', 'cooked']:
         spacer()
         print("Using '%s' format" % format)
         spacer()
@@ -354,23 +355,23 @@ def test_transform_output(argument_pair):
 def test_make_relation_set():
     print(
         MaceCommand._make_relation_set(num_entities=3, values=[1, 0, 1])
-        == set([("c",), ("a",)])
+        == set([('c',), ('a',)])
     )
     print(
         MaceCommand._make_relation_set(
             num_entities=3, values=[0, 0, 0, 0, 0, 0, 1, 0, 0]
         )
-        == set([("c", "a")])
+        == set([('c', 'a')])
     )
     print(
         MaceCommand._make_relation_set(num_entities=2, values=[0, 0, 1, 0, 0, 0, 1, 0])
-        == set([("a", "b", "a"), ("b", "b", "a")])
+        == set([('a', 'b', 'a'), ('b', 'b', 'a')])
     )
 
 
 arguments = [
-    ("mortal(Socrates)", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]),
-    ("(not mortal(Socrates))", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]),
+    ('mortal(Socrates)', ['all x.(man(x) -> mortal(x))', 'man(Socrates)']),
+    ('(not mortal(Socrates))', ['all x.(man(x) -> mortal(x))', 'man(Socrates)']),
 ]
 
 
@@ -380,5 +381,5 @@ def demo():
     test_transform_output(arguments[1])
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
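
The long letter list in MaceCommand._make_model_var above maps Mace4's integer entities onto the names 'a' ... 'z', 'a1', 'b1', and so on. A compact sketch of the same scheme for illustration (it adds a modulo so values of 26 or more wrap instead of overrunning the list lookup):

    import string

    def make_model_var(value):
        # 0 -> 'a', 25 -> 'z', 26 -> 'a1', 27 -> 'b1', ...
        letter = string.ascii_lowercase[value % 26]
        num = value // 26
        return letter + str(num) if num > 0 else letter

    assert make_model_var(0) == 'a'
    assert make_model_var(26) == 'a1'
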
index 3bbb8d2..b9180f0 100644 (file)
@@ -2,7 +2,7 @@
 #
 # Author: Daniel H. Garrette <dhgarrette@gmail.com>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org>
 # For license information, see LICENSE.TXT
 
@@ -11,6 +11,7 @@ A module to perform nonmonotonic reasoning.  The ideas and demonstrations in
 this module are based on "Logical Foundations of Artificial Intelligence" by
 Michael R. Genesereth and Nils J. Nilsson.
 """
+from __future__ import print_function, unicode_literals
 
 from collections import defaultdict
 from functools import reduce
@@ -34,6 +35,7 @@ from nltk.sem.logic import (
 )
 
 from nltk.inference.api import Prover, ProverCommandDecorator
+from nltk.compat import python_2_unicode_compatible
 
 
 class ProverParseError(Exception):
@@ -299,6 +301,7 @@ class ClosedWorldProver(ProverCommandDecorator):
                         predDict[func1].validate_sig_len(sig)
 
 
+@python_2_unicode_compatible
 class PredHolder(object):
     """
     This class will be used by a dictionary that will store information
@@ -335,7 +338,7 @@ class PredHolder(object):
             raise Exception("Signature lengths do not match")
 
     def __str__(self):
-        return "(%s,%s,%s)" % (self.signatures, self.properties, self.signature_len)
+        return '(%s,%s,%s)' % (self.signatures, self.properties, self.signature_len)
 
     def __repr__(self):
         return "%s" % self
@@ -344,151 +347,151 @@ class PredHolder(object):
 def closed_domain_demo():
     lexpr = Expression.fromstring
 
-    p1 = lexpr(r"exists x.walk(x)")
-    p2 = lexpr(r"man(Socrates)")
-    c = lexpr(r"walk(Socrates)")
+    p1 = lexpr(r'exists x.walk(x)')
+    p2 = lexpr(r'man(Socrates)')
+    c = lexpr(r'walk(Socrates)')
     prover = Prover9Command(c, [p1, p2])
     print(prover.prove())
     cdp = ClosedDomainProver(prover)
-    print("assumptions:")
+    print('assumptions:')
     for a in cdp.assumptions():
-        print("   ", a)
-    print("goal:", cdp.goal())
+        print('   ', a)
+    print('goal:', cdp.goal())
     print(cdp.prove())
 
-    p1 = lexpr(r"exists x.walk(x)")
-    p2 = lexpr(r"man(Socrates)")
-    p3 = lexpr(r"-walk(Bill)")
-    c = lexpr(r"walk(Socrates)")
+    p1 = lexpr(r'exists x.walk(x)')
+    p2 = lexpr(r'man(Socrates)')
+    p3 = lexpr(r'-walk(Bill)')
+    c = lexpr(r'walk(Socrates)')
     prover = Prover9Command(c, [p1, p2, p3])
     print(prover.prove())
     cdp = ClosedDomainProver(prover)
-    print("assumptions:")
+    print('assumptions:')
     for a in cdp.assumptions():
-        print("   ", a)
-    print("goal:", cdp.goal())
+        print('   ', a)
+    print('goal:', cdp.goal())
     print(cdp.prove())
 
-    p1 = lexpr(r"exists x.walk(x)")
-    p2 = lexpr(r"man(Socrates)")
-    p3 = lexpr(r"-walk(Bill)")
-    c = lexpr(r"walk(Socrates)")
+    p1 = lexpr(r'exists x.walk(x)')
+    p2 = lexpr(r'man(Socrates)')
+    p3 = lexpr(r'-walk(Bill)')
+    c = lexpr(r'walk(Socrates)')
     prover = Prover9Command(c, [p1, p2, p3])
     print(prover.prove())
     cdp = ClosedDomainProver(prover)
-    print("assumptions:")
+    print('assumptions:')
     for a in cdp.assumptions():
-        print("   ", a)
-    print("goal:", cdp.goal())
+        print('   ', a)
+    print('goal:', cdp.goal())
     print(cdp.prove())
 
-    p1 = lexpr(r"walk(Socrates)")
-    p2 = lexpr(r"walk(Bill)")
-    c = lexpr(r"all x.walk(x)")
+    p1 = lexpr(r'walk(Socrates)')
+    p2 = lexpr(r'walk(Bill)')
+    c = lexpr(r'all x.walk(x)')
     prover = Prover9Command(c, [p1, p2])
     print(prover.prove())
     cdp = ClosedDomainProver(prover)
-    print("assumptions:")
+    print('assumptions:')
     for a in cdp.assumptions():
-        print("   ", a)
-    print("goal:", cdp.goal())
+        print('   ', a)
+    print('goal:', cdp.goal())
     print(cdp.prove())
 
-    p1 = lexpr(r"girl(mary)")
-    p2 = lexpr(r"dog(rover)")
-    p3 = lexpr(r"all x.(girl(x) -> -dog(x))")
-    p4 = lexpr(r"all x.(dog(x) -> -girl(x))")
-    p5 = lexpr(r"chase(mary, rover)")
-    c = lexpr(r"exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))")
+    p1 = lexpr(r'girl(mary)')
+    p2 = lexpr(r'dog(rover)')
+    p3 = lexpr(r'all x.(girl(x) -> -dog(x))')
+    p4 = lexpr(r'all x.(dog(x) -> -girl(x))')
+    p5 = lexpr(r'chase(mary, rover)')
+    c = lexpr(r'exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))')
     prover = Prover9Command(c, [p1, p2, p3, p4, p5])
     print(prover.prove())
     cdp = ClosedDomainProver(prover)
-    print("assumptions:")
+    print('assumptions:')
     for a in cdp.assumptions():
-        print("   ", a)
-    print("goal:", cdp.goal())
+        print('   ', a)
+    print('goal:', cdp.goal())
     print(cdp.prove())
 
 
 def unique_names_demo():
     lexpr = Expression.fromstring
 
-    p1 = lexpr(r"man(Socrates)")
-    p2 = lexpr(r"man(Bill)")
-    c = lexpr(r"exists x.exists y.(x != y)")
+    p1 = lexpr(r'man(Socrates)')
+    p2 = lexpr(r'man(Bill)')
+    c = lexpr(r'exists x.exists y.(x != y)')
     prover = Prover9Command(c, [p1, p2])
     print(prover.prove())
     unp = UniqueNamesProver(prover)
-    print("assumptions:")
+    print('assumptions:')
     for a in unp.assumptions():
-        print("   ", a)
-    print("goal:", unp.goal())
+        print('   ', a)
+    print('goal:', unp.goal())
     print(unp.prove())
 
-    p1 = lexpr(r"all x.(walk(x) -> (x = Socrates))")
-    p2 = lexpr(r"Bill = William")
-    p3 = lexpr(r"Bill = Billy")
-    c = lexpr(r"-walk(William)")
+    p1 = lexpr(r'all x.(walk(x) -> (x = Socrates))')
+    p2 = lexpr(r'Bill = William')
+    p3 = lexpr(r'Bill = Billy')
+    c = lexpr(r'-walk(William)')
     prover = Prover9Command(c, [p1, p2, p3])
     print(prover.prove())
     unp = UniqueNamesProver(prover)
-    print("assumptions:")
+    print('assumptions:')
     for a in unp.assumptions():
-        print("   ", a)
-    print("goal:", unp.goal())
+        print('   ', a)
+    print('goal:', unp.goal())
     print(unp.prove())
 
 
 def closed_world_demo():
     lexpr = Expression.fromstring
 
-    p1 = lexpr(r"walk(Socrates)")
-    p2 = lexpr(r"(Socrates != Bill)")
-    c = lexpr(r"-walk(Bill)")
+    p1 = lexpr(r'walk(Socrates)')
+    p2 = lexpr(r'(Socrates != Bill)')
+    c = lexpr(r'-walk(Bill)')
     prover = Prover9Command(c, [p1, p2])
     print(prover.prove())
     cwp = ClosedWorldProver(prover)
-    print("assumptions:")
+    print('assumptions:')
     for a in cwp.assumptions():
-        print("   ", a)
-    print("goal:", cwp.goal())
+        print('   ', a)
+    print('goal:', cwp.goal())
     print(cwp.prove())
 
-    p1 = lexpr(r"see(Socrates, John)")
-    p2 = lexpr(r"see(John, Mary)")
-    p3 = lexpr(r"(Socrates != John)")
-    p4 = lexpr(r"(John != Mary)")
-    c = lexpr(r"-see(Socrates, Mary)")
+    p1 = lexpr(r'see(Socrates, John)')
+    p2 = lexpr(r'see(John, Mary)')
+    p3 = lexpr(r'(Socrates != John)')
+    p4 = lexpr(r'(John != Mary)')
+    c = lexpr(r'-see(Socrates, Mary)')
     prover = Prover9Command(c, [p1, p2, p3, p4])
     print(prover.prove())
     cwp = ClosedWorldProver(prover)
-    print("assumptions:")
+    print('assumptions:')
     for a in cwp.assumptions():
-        print("   ", a)
-    print("goal:", cwp.goal())
+        print('   ', a)
+    print('goal:', cwp.goal())
     print(cwp.prove())
 
-    p1 = lexpr(r"all x.(ostrich(x) -> bird(x))")
-    p2 = lexpr(r"bird(Tweety)")
-    p3 = lexpr(r"-ostrich(Sam)")
-    p4 = lexpr(r"Sam != Tweety")
-    c = lexpr(r"-bird(Sam)")
+    p1 = lexpr(r'all x.(ostrich(x) -> bird(x))')
+    p2 = lexpr(r'bird(Tweety)')
+    p3 = lexpr(r'-ostrich(Sam)')
+    p4 = lexpr(r'Sam != Tweety')
+    c = lexpr(r'-bird(Sam)')
     prover = Prover9Command(c, [p1, p2, p3, p4])
     print(prover.prove())
     cwp = ClosedWorldProver(prover)
-    print("assumptions:")
+    print('assumptions:')
     for a in cwp.assumptions():
-        print("   ", a)
-    print("goal:", cwp.goal())
+        print('   ', a)
+    print('goal:', cwp.goal())
     print(cwp.prove())
 
 
 def combination_prover_demo():
     lexpr = Expression.fromstring
 
-    p1 = lexpr(r"see(Socrates, John)")
-    p2 = lexpr(r"see(John, Mary)")
-    c = lexpr(r"-see(Socrates, Mary)")
+    p1 = lexpr(r'see(Socrates, John)')
+    p2 = lexpr(r'see(John, Mary)')
+    c = lexpr(r'-see(Socrates, Mary)')
     prover = Prover9Command(c, [p1, p2])
     print(prover.prove())
     command = ClosedDomainProver(UniqueNamesProver(ClosedWorldProver(prover)))
@@ -503,32 +506,32 @@ def default_reasoning_demo():
     premises = []
 
     # define taxonomy
-    premises.append(lexpr(r"all x.(elephant(x)        -> animal(x))"))
-    premises.append(lexpr(r"all x.(bird(x)            -> animal(x))"))
-    premises.append(lexpr(r"all x.(dove(x)            -> bird(x))"))
-    premises.append(lexpr(r"all x.(ostrich(x)         -> bird(x))"))
-    premises.append(lexpr(r"all x.(flying_ostrich(x)  -> ostrich(x))"))
+    premises.append(lexpr(r'all x.(elephant(x)        -> animal(x))'))
+    premises.append(lexpr(r'all x.(bird(x)            -> animal(x))'))
+    premises.append(lexpr(r'all x.(dove(x)            -> bird(x))'))
+    premises.append(lexpr(r'all x.(ostrich(x)         -> bird(x))'))
+    premises.append(lexpr(r'all x.(flying_ostrich(x)  -> ostrich(x))'))
 
     # default properties
     premises.append(
-        lexpr(r"all x.((animal(x)  & -Ab1(x)) -> -fly(x))")
+        lexpr(r'all x.((animal(x)  & -Ab1(x)) -> -fly(x))')
     )  # normal animals don't fly
     premises.append(
-        lexpr(r"all x.((bird(x)    & -Ab2(x)) -> fly(x))")
+        lexpr(r'all x.((bird(x)    & -Ab2(x)) -> fly(x))')
     )  # normal birds fly
     premises.append(
-        lexpr(r"all x.((ostrich(x) & -Ab3(x)) -> -fly(x))")
+        lexpr(r'all x.((ostrich(x) & -Ab3(x)) -> -fly(x))')
     )  # normal ostriches don't fly
 
     # specify abnormal entities
-    premises.append(lexpr(r"all x.(bird(x)           -> Ab1(x))"))  # flight
-    premises.append(lexpr(r"all x.(ostrich(x)        -> Ab2(x))"))  # non-flying bird
-    premises.append(lexpr(r"all x.(flying_ostrich(x) -> Ab3(x))"))  # flying ostrich
+    premises.append(lexpr(r'all x.(bird(x)           -> Ab1(x))'))  # flight
+    premises.append(lexpr(r'all x.(ostrich(x)        -> Ab2(x))'))  # non-flying bird
+    premises.append(lexpr(r'all x.(flying_ostrich(x) -> Ab3(x))'))  # flying ostrich
 
     # define entities
-    premises.append(lexpr(r"elephant(E)"))
-    premises.append(lexpr(r"dove(D)"))
-    premises.append(lexpr(r"ostrich(O)"))
+    premises.append(lexpr(r'elephant(E)'))
+    premises.append(lexpr(r'dove(D)'))
+    premises.append(lexpr(r'ostrich(O)'))
 
     # print the assumptions
     prover = Prover9Command(None, premises)
@@ -536,9 +539,9 @@ def default_reasoning_demo():
     for a in command.assumptions():
         print(a)
 
-    print_proof("-fly(E)", premises)
-    print_proof("fly(D)", premises)
-    print_proof("-fly(O)", premises)
+    print_proof('-fly(E)', premises)
+    print_proof('fly(D)', premises)
+    print_proof('-fly(O)', premises)
 
 
 def print_proof(goal, premises):
@@ -556,5 +559,5 @@ def demo():
     default_reasoning_demo()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
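
The @python_2_unicode_compatible decorator reintroduced above (imported from nltk.compat, mirroring six) is a no-op on Python 3; on Python 2 it moves a text-returning __str__ to __unicode__ and installs a byte-encoding __str__ in its place. A rough sketch of how the decorated classes behave (the class body is illustrative only):

    from six import python_2_unicode_compatible

    @python_2_unicode_compatible
    class PredHolderLike(object):
        def __str__(self):
            # Text on Python 3; routed through __unicode__ on Python 2.
            return '(%s,%s,%s)' % ([], [], None)

    print(PredHolderLike())  # prints ([],[],None) under either interpreter
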
index 5a76c34..3ac69fa 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Interface to the Prover9 Theorem Prover
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Dan Garrette <dhgarrette@gmail.com>
 #         Ewan Klein <ewan@inf.ed.ac.uk>
 #
@@ -9,6 +9,7 @@
 """
 A theorem prover that makes use of the external 'Prover9' package.
 """
+from __future__ import print_function
 
 import os
 import subprocess
@@ -51,14 +52,14 @@ class Prover9CommandParent(object):
     and generating prover9-style input files from them.
     """
 
-    def print_assumptions(self, output_format="nltk"):
+    def print_assumptions(self, output_format='nltk'):
         """
         Print the list of the current assumptions.
         """
-        if output_format.lower() == "nltk":
+        if output_format.lower() == 'nltk':
             for a in self.assumptions():
                 print(a)
-        elif output_format.lower() == "prover9":
+        elif output_format.lower() == 'prover9':
             for a in convert_to_prover9(self.assumptions()):
                 print(a)
         else:
@@ -102,7 +103,7 @@ class Prover9Command(Prover9CommandParent, BaseProverCommand):
         :see BaseProverCommand.decorate_proof()
         """
         if simplify:
-            return self._prover._call_prooftrans(proof_string, ["striplabels"])[
+            return self._prover._call_prooftrans(proof_string, ['striplabels'])[
                 0
             ].rstrip()
         else:
@@ -123,13 +124,13 @@ class Prover9Parent(object):
             self._binary_location = None
             self._prover9_bin = None
         else:
-            name = "prover9"
+            name = 'prover9'
             self._prover9_bin = nltk.internals.find_binary(
                 name,
                 path_to_bin=binary_location,
-                env_vars=["PROVER9"],
-                url="http://www.cs.unm.edu/~mccune/prover9/",
-                binary_names=[name, name + ".exe"],
+                env_vars=['PROVER9'],
+                url='http://www.cs.unm.edu/~mccune/prover9/',
+                binary_names=[name, name + '.exe'],
                 verbose=verbose,
             )
             self._binary_location = self._prover9_bin.rsplit(os.path.sep, 1)
@@ -140,18 +141,18 @@ class Prover9Parent(object):
         prover9 binary.  This string is formed based on the goal,
         assumptions, and timeout value of this object.
         """
-        s = ""
+        s = ''
 
         if assumptions:
-            s += "formulas(assumptions).\n"
+            s += 'formulas(assumptions).\n'
             for p9_assumption in convert_to_prover9(assumptions):
-                s += "    %s.\n" % p9_assumption
-            s += "end_of_list.\n\n"
+                s += '    %s.\n' % p9_assumption
+            s += 'end_of_list.\n\n'
 
         if goal:
-            s += "formulas(goals).\n"
-            s += "    %s.\n" % convert_to_prover9(goal)
-            s += "end_of_list.\n\n"
+            s += 'formulas(goals).\n'
+            s += '    %s.\n' % convert_to_prover9(goal)
+            s += 'end_of_list.\n\n'
 
         return s
 
@@ -162,12 +163,12 @@ class Prover9Parent(object):
         for the prover9 executables.
         """
         return [
-            "/usr/local/bin/prover9",
-            "/usr/local/bin/prover9/bin",
-            "/usr/local/bin",
-            "/usr/bin",
-            "/usr/local/prover9",
-            "/usr/local/share/prover9",
+            '/usr/local/bin/prover9',
+            '/usr/local/bin/prover9/bin',
+            '/usr/local/bin',
+            '/usr/bin',
+            '/usr/local/prover9',
+            '/usr/local/share/prover9',
         ]
 
     def _find_binary(self, name, verbose=False):
@@ -177,9 +178,9 @@ class Prover9Parent(object):
         return nltk.internals.find_binary(
             name,
             searchpath=binary_locations,
-            env_vars=["PROVER9"],
-            url="http://www.cs.unm.edu/~mccune/prover9/",
-            binary_names=[name, name + ".exe"],
+            env_vars=['PROVER9'],
+            url='http://www.cs.unm.edu/~mccune/prover9/',
+            binary_names=[name, name + '.exe'],
             verbose=verbose,
         )
 
@@ -194,9 +195,9 @@ class Prover9Parent(object):
         :see: ``config_prover9``
         """
         if verbose:
-            print("Calling:", binary)
-            print("Args:", args)
-            print("Input:\n", input_str, "\n")
+            print('Calling:', binary)
+            print('Args:', args)
+            print('Input:\n', input_str, '\n')
 
         # Call prover9 via a subprocess
         cmd = [binary] + args
@@ -210,11 +211,11 @@ class Prover9Parent(object):
         (stdout, stderr) = p.communicate(input=input_str)
 
         if verbose:
-            print("Return code:", p.returncode)
+            print('Return code:', p.returncode)
             if stdout:
-                print("stdout:\n", stdout, "\n")
+                print('stdout:\n', stdout, '\n')
             if stderr:
-                print("stderr:\n", stderr, "\n")
+                print('stderr:\n', stderr, '\n')
 
         return (stdout.decode("utf-8"), p.returncode)
 
@@ -229,14 +230,14 @@ def convert_to_prover9(input):
             try:
                 result.append(_convert_to_prover9(s.simplify()))
             except:
-                print("input %s cannot be converted to Prover9 input syntax" % input)
+                print('input %s cannot be converted to Prover9 input syntax' % input)
                 raise
         return result
     else:
         try:
             return _convert_to_prover9(input.simplify())
         except:
-            print("input %s cannot be converted to Prover9 input syntax" % input)
+            print('input %s cannot be converted to Prover9 input syntax' % input)
             raise
 
 
@@ -246,59 +247,59 @@ def _convert_to_prover9(expression):
     """
     if isinstance(expression, ExistsExpression):
         return (
-            "exists "
+            'exists '
             + str(expression.variable)
-            + " "
+            + ' '
             + _convert_to_prover9(expression.term)
         )
     elif isinstance(expression, AllExpression):
         return (
-            "all "
+            'all '
             + str(expression.variable)
-            + " "
+            + ' '
             + _convert_to_prover9(expression.term)
         )
     elif isinstance(expression, NegatedExpression):
-        return "-(" + _convert_to_prover9(expression.term) + ")"
+        return '-(' + _convert_to_prover9(expression.term) + ')'
     elif isinstance(expression, AndExpression):
         return (
-            "("
+            '('
             + _convert_to_prover9(expression.first)
-            + " & "
+            + ' & '
             + _convert_to_prover9(expression.second)
-            + ")"
+            + ')'
         )
     elif isinstance(expression, OrExpression):
         return (
-            "("
+            '('
             + _convert_to_prover9(expression.first)
-            + " | "
+            + ' | '
             + _convert_to_prover9(expression.second)
-            + ")"
+            + ')'
         )
     elif isinstance(expression, ImpExpression):
         return (
-            "("
+            '('
             + _convert_to_prover9(expression.first)
-            + " -> "
+            + ' -> '
             + _convert_to_prover9(expression.second)
-            + ")"
+            + ')'
         )
     elif isinstance(expression, IffExpression):
         return (
-            "("
+            '('
             + _convert_to_prover9(expression.first)
-            + " <-> "
+            + ' <-> '
             + _convert_to_prover9(expression.second)
-            + ")"
+            + ')'
         )
     elif isinstance(expression, EqualityExpression):
         return (
-            "("
+            '('
             + _convert_to_prover9(expression.first)
-            + " = "
+            + ' = '
             + _convert_to_prover9(expression.second)
-            + ")"
+            + ')'
         )
     else:
         return str(expression)
@@ -333,7 +334,7 @@ class Prover9(Prover9Parent, Prover):
         """
         :see: Prover9Parent.prover9_input
         """
-        s = "clear(auto_denials).\n"  # only one proof required
+        s = 'clear(auto_denials).\n'  # only one proof required
         return s + Prover9Parent.prover9_input(self, goal, assumptions)
 
     def _call_prover9(self, input_str, args=[], verbose=False):
@@ -346,11 +347,11 @@ class Prover9(Prover9Parent, Prover):
         :see: ``config_prover9``
         """
         if self._prover9_bin is None:
-            self._prover9_bin = self._find_binary("prover9", verbose)
+            self._prover9_bin = self._find_binary('prover9', verbose)
 
-        updated_input_str = ""
+        updated_input_str = ''
         if self._timeout > 0:
-            updated_input_str += "assign(max_seconds, %d).\n\n" % self._timeout
+            updated_input_str += 'assign(max_seconds, %d).\n\n' % self._timeout
         updated_input_str += input_str
 
         stdout, returncode = self._call(
@@ -358,7 +359,7 @@ class Prover9(Prover9Parent, Prover):
         )
 
         if returncode not in [0, 2]:
-            errormsgprefix = "%%ERROR:"
+            errormsgprefix = '%%ERROR:'
             if errormsgprefix in stdout:
                 msgstart = stdout.index(errormsgprefix)
                 errormsg = stdout[msgstart:].strip()
@@ -381,7 +382,7 @@ class Prover9(Prover9Parent, Prover):
         :see: ``config_prover9``
         """
         if self._prooftrans_bin is None:
-            self._prooftrans_bin = self._find_binary("prooftrans", verbose)
+            self._prooftrans_bin = self._find_binary('prooftrans', verbose)
 
         return self._call(input_str, self._prooftrans_bin, args, verbose)
 
@@ -390,7 +391,7 @@ class Prover9Exception(Exception):
     def __init__(self, returncode, message):
         msg = p9_return_codes[returncode]
         if message:
-            msg += "\n%s" % message
+            msg += '\n%s' % message
         Exception.__init__(self, msg)
 
 
@@ -409,8 +410,8 @@ class Prover9LimitExceededException(Prover9Exception):
 
 def test_config():
 
-    a = Expression.fromstring("(walk(j) & sing(j))")
-    g = Expression.fromstring("walk(j)")
+    a = Expression.fromstring('(walk(j) & sing(j))')
+    g = Expression.fromstring('walk(j)')
     p = Prover9Command(g, assumptions=[a])
     p._executable_path = None
     p.prover9_search = []
@@ -438,56 +439,56 @@ def test_prove(arguments):
         alist = [Expression.fromstring(a) for a in assumptions]
         p = Prover9Command(g, assumptions=alist).prove()
         for a in alist:
-            print("   %s" % a)
-        print("|- %s: %s\n" % (g, p))
+            print('   %s' % a)
+        print('|- %s: %s\n' % (g, p))
 
 
 arguments = [
-    ("(man(x) <-> (not (not man(x))))", []),
-    ("(not (man(x) & (not man(x))))", []),
-    ("(man(x) | (not man(x)))", []),
-    ("(man(x) & (not man(x)))", []),
-    ("(man(x) -> man(x))", []),
-    ("(not (man(x) & (not man(x))))", []),
-    ("(man(x) | (not man(x)))", []),
-    ("(man(x) -> man(x))", []),
-    ("(man(x) <-> man(x))", []),
-    ("(not (man(x) <-> (not man(x))))", []),
-    ("mortal(Socrates)", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]),
-    ("((all x.(man(x) -> walks(x)) & man(Socrates)) -> some y.walks(y))", []),
-    ("(all x.man(x) -> all x.man(x))", []),
-    ("some x.all y.sees(x,y)", []),
+    ('(man(x) <-> (not (not man(x))))', []),
+    ('(not (man(x) & (not man(x))))', []),
+    ('(man(x) | (not man(x)))', []),
+    ('(man(x) & (not man(x)))', []),
+    ('(man(x) -> man(x))', []),
+    ('(not (man(x) & (not man(x))))', []),
+    ('(man(x) | (not man(x)))', []),
+    ('(man(x) -> man(x))', []),
+    ('(man(x) <-> man(x))', []),
+    ('(not (man(x) <-> (not man(x))))', []),
+    ('mortal(Socrates)', ['all x.(man(x) -> mortal(x))', 'man(Socrates)']),
+    ('((all x.(man(x) -> walks(x)) & man(Socrates)) -> some y.walks(y))', []),
+    ('(all x.man(x) -> all x.man(x))', []),
+    ('some x.all y.sees(x,y)', []),
     (
-        "some e3.(walk(e3) & subj(e3, mary))",
+        'some e3.(walk(e3) & subj(e3, mary))',
         [
-            "some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))"
+            'some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))'
         ],
     ),
     (
-        "some x e1.(see(e1) & subj(e1, x) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))",
+        'some x e1.(see(e1) & subj(e1, x) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))',
         [
-            "some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))"
+            'some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))'
         ],
     ),
 ]
 
 expressions = [
-    r"some x y.sees(x,y)",
-    r"some x.(man(x) & walks(x))",
-    r"\x.(man(x) & walks(x))",
-    r"\x y.sees(x,y)",
-    r"walks(john)",
-    r"\x.big(x, \y.mouse(y))",
-    r"(walks(x) & (runs(x) & (threes(x) & fours(x))))",
-    r"(walks(x) -> runs(x))",
-    r"some x.(PRO(x) & sees(John, x))",
-    r"some x.(man(x) & (not walks(x)))",
-    r"all x.(man(x) -> walks(x))",
+    r'some x y.sees(x,y)',
+    r'some x.(man(x) & walks(x))',
+    r'\x.(man(x) & walks(x))',
+    r'\x y.sees(x,y)',
+    r'walks(john)',
+    r'\x.big(x, \y.mouse(y))',
+    r'(walks(x) & (runs(x) & (threes(x) & fours(x))))',
+    r'(walks(x) -> runs(x))',
+    r'some x.(PRO(x) & sees(John, x))',
+    r'some x.(man(x) & (not walks(x)))',
+    r'all x.(man(x) -> walks(x))',
 ]
 
 
 def spacer(num=45):
-    print("-" * num)
+    print('-' * num)
 
 
 def demo():
@@ -504,5 +505,5 @@ def demo():
     test_prove(arguments)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
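
For reference, the prover9_input method shown above serialises a goal and its assumptions into Prover9's list syntax; for the mortal(Socrates) argument used in the demos, the generated input looks roughly like this (convert_to_prover9 rewrites NLTK's "all x.(...)" binders into Prover9's "all x (...)" form before the strings are assembled):

    formulas(assumptions).
        all x (man(x) -> mortal(x)).
        man(Socrates).
    end_of_list.

    formulas(goals).
        mortal(Socrates).
    end_of_list.
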
index df19776..06761a9 100644 (file)
@@ -2,13 +2,14 @@
 #
 # Author: Dan Garrette <dhgarrette@gmail.com>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org>
 # For license information, see LICENSE.TXT
 
 """
 Module for a resolution-based First Order theorem prover.
 """
+from __future__ import print_function, unicode_literals
 
 import operator
 from collections import defaultdict
@@ -31,6 +32,7 @@ from nltk.sem.logic import (
 )
 
 from nltk.inference.api import Prover, BaseProverCommand
+from nltk.compat import python_2_unicode_compatible
 
 
 class ProverParseError(Exception):
@@ -38,7 +40,7 @@ class ProverParseError(Exception):
 
 
 class ResolutionProver(Prover):
-    ANSWER_KEY = "ANSWER"
+    ANSWER_KEY = 'ANSWER'
     _assume_false = True
 
     def _prove(self, goal=None, assumptions=None, verbose=False):
@@ -63,7 +65,7 @@ class ResolutionProver(Prover):
                 print(ResolutionProverCommand._decorate_clauses(clauses))
         except RuntimeError as e:
             if self._assume_false and str(e).startswith(
-                "maximum recursion depth exceeded"
+                'maximum recursion depth exceeded'
             ):
                 result = False
                 clauses = []
@@ -157,22 +159,23 @@ class ResolutionProverCommand(BaseProverCommand):
         """
         Decorate the proof output.
         """
-        out = ""
+        out = ''
         max_clause_len = max([len(str(clause)) for clause in clauses])
         max_seq_len = len(str(len(clauses)))
         for i in range(len(clauses)):
-            parents = "A"
-            taut = ""
+            parents = 'A'
+            taut = ''
             if clauses[i].is_tautology():
-                taut = "Tautology"
+                taut = 'Tautology'
             if clauses[i]._parents:
                 parents = str(clauses[i]._parents)
-            parents = " " * (max_clause_len - len(str(clauses[i])) + 1) + parents
-            seq = " " * (max_seq_len - len(str(i + 1))) + str(i + 1)
-            out += "[%s] %s %s %s\n" % (seq, clauses[i], parents, taut)
+            parents = ' ' * (max_clause_len - len(str(clauses[i])) + 1) + parents
+            seq = ' ' * (max_seq_len - len(str(i + 1))) + str(i + 1)
+            out += '[%s] %s %s %s\n' % (seq, clauses[i], parents, taut)
         return out
 
 
+@python_2_unicode_compatible
 class Clause(list):
     def __init__(self, data):
         list.__init__(self, data)
@@ -333,7 +336,7 @@ class Clause(list):
         return Clause([atom.substitute_bindings(bindings) for atom in self])
 
     def __str__(self):
-        return "{" + ", ".join("%s" % item for item in self) + "}"
+        return '{' + ', '.join("%s" % item for item in self) + '}'
 
     def __repr__(self):
         return "%s" % self
@@ -343,7 +346,7 @@ def _iterate_first(first, second, bindings, used, skipped, finalize_method, debu
     """
     This method facilitates movement through the terms of 'self'
     """
-    debug.line("unify(%s,%s) %s" % (first, second, bindings))
+    debug.line('unify(%s,%s) %s' % (first, second, bindings))
 
     if not len(first) or not len(second):  # if no more recursions can be performed
         return finalize_method(first, second, bindings, used, skipped, debug)
@@ -387,7 +390,7 @@ def _iterate_second(first, second, bindings, used, skipped, finalize_method, deb
     """
     This method facilitates movement through the terms of 'other'
     """
-    debug.line("unify(%s,%s) %s" % (first, second, bindings))
+    debug.line('unify(%s,%s) %s' % (first, second, bindings))
 
     if not len(first) or not len(second):  # if no more recursions can be performed
         return finalize_method(first, second, bindings, used, skipped, debug)
@@ -471,10 +474,10 @@ def _unify_terms(a, b, bindings=None, used=None):
 def _complete_unify_path(first, second, bindings, used, skipped, debug):
     if used[0] or used[1]:  # if bindings were made along the path
         newclause = Clause(skipped[0] + skipped[1] + first + second)
-        debug.line("  -> New Clause: %s" % newclause)
+        debug.line('  -> New Clause: %s' % newclause)
         return [newclause.substitute_bindings(bindings)]
     else:  # no bindings made means no unification occurred.  so no result
-        debug.line("  -> End")
+        debug.line('  -> End')
         return []
 
 
@@ -528,6 +531,7 @@ def _clausify(expression):
     raise ProverParseError()
 
 
+@python_2_unicode_compatible
 class BindingDict(object):
     def __init__(self, binding_list=None):
         """
@@ -571,11 +575,11 @@ class BindingDict(object):
                 self.d[binding.variable] = binding2
             else:
                 raise BindingException(
-                    "Variable %s already bound to another " "value" % (variable)
+                    'Variable %s already bound to another ' 'value' % (variable)
                 )
         else:
             raise BindingException(
-                "Variable %s already bound to another " "value" % (variable)
+                'Variable %s already bound to another ' 'value' % (variable)
             )
 
     def __getitem__(self, variable):
@@ -617,8 +621,8 @@ class BindingDict(object):
         return len(self.d)
 
     def __str__(self):
-        data_str = ", ".join("%s: %s" % (v, self.d[v]) for v in sorted(self.d.keys()))
-        return "{" + data_str + "}"
+        data_str = ', '.join('%s: %s' % (v, self.d[v]) for v in sorted(self.d.keys()))
+        return '{' + data_str + '}'
 
     def __repr__(self):
         return "%s" % self
@@ -681,70 +685,70 @@ class DebugObject(object):
 
     def line(self, line):
         if self.enabled:
-            print("    " * self.indent + line)
+            print('    ' * self.indent + line)
 
 
 def testResolutionProver():
-    resolution_test(r"man(x)")
-    resolution_test(r"(man(x) -> man(x))")
-    resolution_test(r"(man(x) -> --man(x))")
-    resolution_test(r"-(man(x) and -man(x))")
-    resolution_test(r"(man(x) or -man(x))")
-    resolution_test(r"(man(x) -> man(x))")
-    resolution_test(r"-(man(x) and -man(x))")
-    resolution_test(r"(man(x) or -man(x))")
-    resolution_test(r"(man(x) -> man(x))")
-    resolution_test(r"(man(x) iff man(x))")
-    resolution_test(r"-(man(x) iff -man(x))")
-    resolution_test("all x.man(x)")
-    resolution_test("-all x.some y.F(x,y) & some x.all y.(-F(x,y))")
-    resolution_test("some x.all y.sees(x,y)")
-
-    p1 = Expression.fromstring(r"all x.(man(x) -> mortal(x))")
-    p2 = Expression.fromstring(r"man(Socrates)")
-    c = Expression.fromstring(r"mortal(Socrates)")
-    print("%s, %s |- %s: %s" % (p1, p2, c, ResolutionProver().prove(c, [p1, p2])))
-
-    p1 = Expression.fromstring(r"all x.(man(x) -> walks(x))")
-    p2 = Expression.fromstring(r"man(John)")
-    c = Expression.fromstring(r"some y.walks(y)")
-    print("%s, %s |- %s: %s" % (p1, p2, c, ResolutionProver().prove(c, [p1, p2])))
-
-    p = Expression.fromstring(r"some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))")
-    c = Expression.fromstring(r"some e0.walk(e0,mary)")
-    print("%s |- %s: %s" % (p, c, ResolutionProver().prove(c, [p])))
+    resolution_test(r'man(x)')
+    resolution_test(r'(man(x) -> man(x))')
+    resolution_test(r'(man(x) -> --man(x))')
+    resolution_test(r'-(man(x) and -man(x))')
+    resolution_test(r'(man(x) or -man(x))')
+    resolution_test(r'(man(x) -> man(x))')
+    resolution_test(r'-(man(x) and -man(x))')
+    resolution_test(r'(man(x) or -man(x))')
+    resolution_test(r'(man(x) -> man(x))')
+    resolution_test(r'(man(x) iff man(x))')
+    resolution_test(r'-(man(x) iff -man(x))')
+    resolution_test('all x.man(x)')
+    resolution_test('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')
+    resolution_test('some x.all y.sees(x,y)')
+
+    p1 = Expression.fromstring(r'all x.(man(x) -> mortal(x))')
+    p2 = Expression.fromstring(r'man(Socrates)')
+    c = Expression.fromstring(r'mortal(Socrates)')
+    print('%s, %s |- %s: %s' % (p1, p2, c, ResolutionProver().prove(c, [p1, p2])))
+
+    p1 = Expression.fromstring(r'all x.(man(x) -> walks(x))')
+    p2 = Expression.fromstring(r'man(John)')
+    c = Expression.fromstring(r'some y.walks(y)')
+    print('%s, %s |- %s: %s' % (p1, p2, c, ResolutionProver().prove(c, [p1, p2])))
+
+    p = Expression.fromstring(r'some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))')
+    c = Expression.fromstring(r'some e0.walk(e0,mary)')
+    print('%s |- %s: %s' % (p, c, ResolutionProver().prove(c, [p])))
 
 
 def resolution_test(e):
     f = Expression.fromstring(e)
     t = ResolutionProver().prove(f)
-    print("|- %s: %s" % (f, t))
+    print('|- %s: %s' % (f, t))
 
 
 def test_clausify():
     lexpr = Expression.fromstring
 
-    print(clausify(lexpr("P(x) | Q(x)")))
-    print(clausify(lexpr("(P(x) & Q(x)) | R(x)")))
-    print(clausify(lexpr("P(x) | (Q(x) & R(x))")))
-    print(clausify(lexpr("(P(x) & Q(x)) | (R(x) & S(x))")))
+    print(clausify(lexpr('P(x) | Q(x)')))
+    print(clausify(lexpr('(P(x) & Q(x)) | R(x)')))
+    print(clausify(lexpr('P(x) | (Q(x) & R(x))')))
+    print(clausify(lexpr('(P(x) & Q(x)) | (R(x) & S(x))')))
 
-    print(clausify(lexpr("P(x) | Q(x) | R(x)")))
-    print(clausify(lexpr("P(x) | (Q(x) & R(x)) | S(x)")))
+    print(clausify(lexpr('P(x) | Q(x) | R(x)')))
+    print(clausify(lexpr('P(x) | (Q(x) & R(x)) | S(x)')))
 
-    print(clausify(lexpr("exists x.P(x) | Q(x)")))
+    print(clausify(lexpr('exists x.P(x) | Q(x)')))
 
-    print(clausify(lexpr("-(-P(x) & Q(x))")))
-    print(clausify(lexpr("P(x) <-> Q(x)")))
-    print(clausify(lexpr("-(P(x) <-> Q(x))")))
-    print(clausify(lexpr("-(all x.P(x))")))
-    print(clausify(lexpr("-(some x.P(x))")))
+    print(clausify(lexpr('-(-P(x) & Q(x))')))
+    print(clausify(lexpr('P(x) <-> Q(x)')))
+    print(clausify(lexpr('-(P(x) <-> Q(x))')))
+    print(clausify(lexpr('-(all x.P(x))')))
+    print(clausify(lexpr('-(some x.P(x))')))
 
-    print(clausify(lexpr("some x.P(x)")))
-    print(clausify(lexpr("some x.all y.P(x,y)")))
-    print(clausify(lexpr("all y.some x.P(x,y)")))
-    print(clausify(lexpr("all z.all y.some x.P(x,y,z)")))
-    print(clausify(lexpr("all x.(all y.P(x,y) -> -all y.(Q(x,y) -> R(x,y)))")))
+    print(clausify(lexpr('some x.P(x)')))
+    print(clausify(lexpr('some x.all y.P(x,y)')))
+    print(clausify(lexpr('all y.some x.P(x,y)')))
+    print(clausify(lexpr('all z.all y.some x.P(x,y,z)')))
+    print(clausify(lexpr('all x.(all y.P(x,y) -> -all y.(Q(x,y) -> R(x,y)))')))
 
 
 def demo():
@@ -753,9 +757,9 @@ def demo():
     testResolutionProver()
     print()
 
-    p = Expression.fromstring("man(x)")
+    p = Expression.fromstring('man(x)')
     print(ResolutionProverCommand(p, [p]).prove())
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
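
The resolution demo above proves mortal(Socrates) from all x.(man(x) -> mortal(x)) and man(Socrates). Worked by hand, in the {...} clause notation that Clause.__str__ prints, the refutation the prover finds is roughly:

    {-man(x), mortal(x)}      clausified premise: all x.(man(x) -> mortal(x))
    {man(Socrates)}           premise
    {-mortal(Socrates)}       negated goal
    # unify x := Socrates and resolve the first two clauses -> {mortal(Socrates)}
    # resolve with the negated goal -> the empty clause {}, so the goal follows
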
index 90c9725..e8cc840 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: First-Order Tableau Theorem Prover
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Dan Garrette <dhgarrette@gmail.com>
 #
 # URL: <http://nltk.org/>
@@ -9,6 +9,7 @@
 """
 Module for a tableau-based First Order theorem prover.
 """
+from __future__ import print_function, unicode_literals
 
 from nltk.internals import Counter
 
@@ -57,7 +58,7 @@ class TableauProver(Prover):
             result = self._attempt_proof(agenda, set(), set(), debugger)
         except RuntimeError as e:
             if self._assume_false and str(e).startswith(
-                "maximum recursion depth exceeded"
+                'maximum recursion depth exceeded'
             ):
                 result = False
             else:
@@ -65,14 +66,14 @@ class TableauProver(Prover):
                     print(e)
                 else:
                     raise e
-        return (result, "\n".join(debugger.lines))
+        return (result, '\n'.join(debugger.lines))
 
     def _attempt_proof(self, agenda, accessible_vars, atoms, debug):
         (current, context), category = agenda.pop_first()
 
         # if there's nothing left in the agenda, and we haven't closed the path
         if not current:
-            debug.line("AGENDA EMPTY")
+            debug.line('AGENDA EMPTY')
             return False
 
         proof_method = {
@@ -107,7 +108,7 @@ class TableauProver(Prover):
     ):
         # Check if the branch is closed.  Return 'True' if it is
         if (current, True) in atoms:
-            debug.line("CLOSED", 1)
+            debug.line('CLOSED', 1)
             return True
 
         if context:
@@ -130,7 +131,7 @@ class TableauProver(Prover):
     ):
         # Check if the branch is closed.  Return 'True' if it is
         if (current.term, False) in atoms:
-            debug.line("CLOSED", 1)
+            debug.line('CLOSED', 1)
             return True
 
         if context:
@@ -153,7 +154,7 @@ class TableauProver(Prover):
     ):
         # Check if the branch is closed.  Return 'True' if it is
         if (current, True) in atoms:
-            debug.line("CLOSED", 1)
+            debug.line('CLOSED', 1)
             return True
 
         # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars
@@ -167,7 +168,7 @@ class TableauProver(Prover):
     ):
         # Check if the branch is closed.  Return 'True' if it is
         if (current.term, False) in atoms:
-            debug.line("CLOSED", 1)
+            debug.line('CLOSED', 1)
             return True
 
         # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars
@@ -183,7 +184,7 @@ class TableauProver(Prover):
         for i, arg in enumerate(args):
             if not TableauProver.is_atom(arg):
                 ctx = f
-                nv = Variable("X%s" % _counter.get())
+                nv = Variable('X%s' % _counter.get())
                 for j, a in enumerate(args):
                     ctx = ctx(VariableExpression(nv)) if i == j else ctx(a)
                 if context:
@@ -191,7 +192,7 @@ class TableauProver(Prover):
                 ctx = LambdaExpression(nv, ctx)
                 agenda.put(arg, ctx)
                 return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
-        raise Exception("If this method is called, there must be a non-atomic argument")
+        raise Exception('If this method is called, there must be a non-atomic argument')
 
     def _attempt_proof_n_app(
         self, current, context, agenda, accessible_vars, atoms, debug
@@ -200,7 +201,7 @@ class TableauProver(Prover):
         for i, arg in enumerate(args):
             if not TableauProver.is_atom(arg):
                 ctx = f
-                nv = Variable("X%s" % _counter.get())
+                nv = Variable('X%s' % _counter.get())
                 for j, a in enumerate(args):
                     ctx = ctx(VariableExpression(nv)) if i == j else ctx(a)
                 if context:
@@ -209,7 +210,7 @@ class TableauProver(Prover):
                 ctx = LambdaExpression(nv, -ctx)
                 agenda.put(-arg, ctx)
                 return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
-        raise Exception("If this method is called, there must be a non-atomic argument")
+        raise Exception('If this method is called, there must be a non-atomic argument')
 
     def _attempt_proof_n_eq(
         self, current, context, agenda, accessible_vars, atoms, debug
@@ -218,7 +219,7 @@ class TableauProver(Prover):
         # Since 'current' is of type '~(a=b)', the path is closed if 'a' == 'b'
         ###########################################################################
         if current.term.first == current.term.second:
-            debug.line("CLOSED", 1)
+            debug.line('CLOSED', 1)
             return True
 
         agenda[Categories.N_EQ].add((current, context))
@@ -365,7 +366,7 @@ class TableauProver(Prover):
 
             if bv_available:
                 variable_to_use = list(bv_available)[0]
-                debug.line("--> Using '%s'" % variable_to_use, 2)
+                debug.line('--> Using \'%s\'' % variable_to_use, 2)
                 current._used_vars |= set([variable_to_use])
                 agenda.put(
                     current.term.replace(current.variable, variable_to_use), context
@@ -375,14 +376,14 @@ class TableauProver(Prover):
 
             else:
                 # no more available variables to substitute
-                debug.line("--> Variables Exhausted", 2)
+                debug.line('--> Variables Exhausted', 2)
                 current._exhausted = True
                 agenda[Categories.ALL].add((current, context))
                 return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1)
 
         else:
             new_unique_variable = VariableExpression(unique_variable())
-            debug.line("--> Using '%s'" % new_unique_variable, 2)
+            debug.line('--> Using \'%s\'' % new_unique_variable, 2)
             current._used_vars |= set([new_unique_variable])
             agenda.put(
                 current.term.replace(current.variable, new_unique_variable), context
@@ -582,20 +583,20 @@ class Debug(object):
         if isinstance(data, tuple):
             ex, ctx = data
             if ctx:
-                data = "%s, %s" % (ex, ctx)
+                data = '%s, %s' % (ex, ctx)
             else:
-                data = "%s" % ex
+                data = '%s' % ex
 
             if isinstance(ex, AllExpression):
                 try:
                     used_vars = "[%s]" % (
                         ",".join("%s" % ve.variable.name for ve in ex._used_vars)
                     )
-                    data += ":   %s" % used_vars
+                    data += ':   %s' % used_vars
                 except AttributeError:
-                    data += ":   []"
+                    data += ':   []'
 
-        newline = "%s%s" % ("   " * (self.indent + indent), data)
+        newline = '%s%s' % ('   ' * (self.indent + indent), data)
         self.lines.append(newline)
 
         if self.verbose:
@@ -627,49 +628,49 @@ class Categories(object):
 
 
 def testTableauProver():
-    tableau_test("P | -P")
-    tableau_test("P & -P")
-    tableau_test("Q", ["P", "(P -> Q)"])
-    tableau_test("man(x)")
-    tableau_test("(man(x) -> man(x))")
-    tableau_test("(man(x) -> --man(x))")
-    tableau_test("-(man(x) and -man(x))")
-    tableau_test("(man(x) or -man(x))")
-    tableau_test("(man(x) -> man(x))")
-    tableau_test("-(man(x) and -man(x))")
-    tableau_test("(man(x) or -man(x))")
-    tableau_test("(man(x) -> man(x))")
-    tableau_test("(man(x) iff man(x))")
-    tableau_test("-(man(x) iff -man(x))")
-    tableau_test("all x.man(x)")
-    tableau_test("all x.all y.((x = y) -> (y = x))")
-    tableau_test("all x.all y.all z.(((x = y) & (y = z)) -> (x = z))")
+    tableau_test('P | -P')
+    tableau_test('P & -P')
+    tableau_test('Q', ['P', '(P -> Q)'])
+    tableau_test('man(x)')
+    tableau_test('(man(x) -> man(x))')
+    tableau_test('(man(x) -> --man(x))')
+    tableau_test('-(man(x) and -man(x))')
+    tableau_test('(man(x) or -man(x))')
+    tableau_test('(man(x) -> man(x))')
+    tableau_test('-(man(x) and -man(x))')
+    tableau_test('(man(x) or -man(x))')
+    tableau_test('(man(x) -> man(x))')
+    tableau_test('(man(x) iff man(x))')
+    tableau_test('-(man(x) iff -man(x))')
+    tableau_test('all x.man(x)')
+    tableau_test('all x.all y.((x = y) -> (y = x))')
+    tableau_test('all x.all y.all z.(((x = y) & (y = z)) -> (x = z))')
     #    tableau_test('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')
     #    tableau_test('some x.all y.sees(x,y)')
 
-    p1 = "all x.(man(x) -> mortal(x))"
-    p2 = "man(Socrates)"
-    c = "mortal(Socrates)"
+    p1 = 'all x.(man(x) -> mortal(x))'
+    p2 = 'man(Socrates)'
+    c = 'mortal(Socrates)'
     tableau_test(c, [p1, p2])
 
-    p1 = "all x.(man(x) -> walks(x))"
-    p2 = "man(John)"
-    c = "some y.walks(y)"
+    p1 = 'all x.(man(x) -> walks(x))'
+    p2 = 'man(John)'
+    c = 'some y.walks(y)'
     tableau_test(c, [p1, p2])
 
-    p = "((x = y) & walks(y))"
-    c = "walks(x)"
+    p = '((x = y) & walks(y))'
+    c = 'walks(x)'
     tableau_test(c, [p])
 
-    p = "((x = y) & ((y = z) & (z = w)))"
-    c = "(x = w)"
+    p = '((x = y) & ((y = z) & (z = w)))'
+    c = '(x = w)'
     tableau_test(c, [p])
 
-    p = "some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))"
-    c = "some e0.walk(e0,mary)"
+    p = 'some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))'
+    c = 'some e0.walk(e0,mary)'
     tableau_test(c, [p])
 
-    c = "(exists x.exists z3.((x = Mary) & ((z3 = John) & sees(z3,x))) <-> exists x.exists z4.((x = John) & ((z4 = Mary) & sees(x,z4))))"
+    c = '(exists x.exists z3.((x = Mary) & ((z3 = John) & sees(z3,x))) <-> exists x.exists z4.((x = John) & ((z4 = Mary) & sees(x,z4))))'
     tableau_test(c)
 
 
@@ -679,19 +680,19 @@ def testTableauProver():
 
 
 def testHigherOrderTableauProver():
-    tableau_test("believe(j, -lie(b))", ["believe(j, -lie(b) & -cheat(b))"])
-    tableau_test("believe(j, lie(b) & cheat(b))", ["believe(j, lie(b))"])
+    tableau_test('believe(j, -lie(b))', ['believe(j, -lie(b) & -cheat(b))'])
+    tableau_test('believe(j, lie(b) & cheat(b))', ['believe(j, lie(b))'])
     tableau_test(
-        "believe(j, lie(b))", ["lie(b)"]
+        'believe(j, lie(b))', ['lie(b)']
     )  # how do we capture that John believes all things that are true
     tableau_test(
-        "believe(j, know(b, cheat(b)))",
-        ["believe(j, know(b, lie(b)) & know(b, steals(b) & cheat(b)))"],
+        'believe(j, know(b, cheat(b)))',
+        ['believe(j, know(b, lie(b)) & know(b, steals(b) & cheat(b)))'],
     )
-    tableau_test("P(Q(y), R(y) & R(z))", ["P(Q(x) & Q(y), R(y) & R(z))"])
+    tableau_test('P(Q(y), R(y) & R(z))', ['P(Q(x) & Q(y), R(y) & R(z))'])
 
-    tableau_test("believe(j, cheat(b) & lie(b))", ["believe(j, lie(b) & cheat(b))"])
-    tableau_test("believe(j, -cheat(b) & -lie(b))", ["believe(j, -lie(b) & -cheat(b))"])
+    tableau_test('believe(j, cheat(b) & lie(b))', ['believe(j, lie(b) & cheat(b))'])
+    tableau_test('believe(j, -cheat(b) & -lie(b))', ['believe(j, -lie(b) & -cheat(b))'])
 
 
 def tableau_test(c, ps=None, verbose=False):
@@ -700,8 +701,8 @@ def tableau_test(c, ps=None, verbose=False):
     if not ps:
         ps = []
     print(
-        "%s |- %s: %s"
-        % (", ".join(ps), pc, TableauProver().prove(pc, pps, verbose=verbose))
+        '%s |- %s: %s'
+        % (', '.join(ps), pc, TableauProver().prove(pc, pps, verbose=verbose))
     )
 
 
@@ -710,5 +711,5 @@ def demo():
     testHigherOrderTableauProver()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
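
A quick illustration of the tableau prover exercised above (editorial sketch, not
part of this patch; the expressions mirror the Socrates case in testTableauProver):

    from nltk.sem import Expression
    from nltk.inference.tableau import TableauProver

    read_expr = Expression.fromstring
    premises = [read_expr('all x.(man(x) -> mortal(x))'), read_expr('man(Socrates)')]
    goal = read_expr('mortal(Socrates)')
    # Expected to print True: the goal follows from the premises.
    print(TableauProver().prove(goal, premises))
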
index ac93c8a..01d0f48 100644 (file)
@@ -1,11 +1,12 @@
 # Natural Language Toolkit: Internal utility functions
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 #         Nitin Madnani <nmadnani@ets.org>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function
 
 import subprocess
 import os
@@ -17,7 +18,16 @@ import types
 import sys
 import stat
 import locale
-from xml.etree import ElementTree
+
+# Use the c version of ElementTree, which is faster, if possible:
+try:
+    from xml.etree import cElementTree as ElementTree
+except ImportError:
+    from xml.etree import ElementTree
+
+from six import string_types
+
+from nltk import compat
 
 ##########################################################################
 # Java Via Command-Line
@@ -45,15 +55,15 @@ def config_java(bin=None, options=None, verbose=False):
     """
     global _java_bin, _java_options
     _java_bin = find_binary(
-        "java",
+        'java',
         bin,
-        env_vars=["JAVAHOME", "JAVA_HOME"],
+        env_vars=['JAVAHOME', 'JAVA_HOME'],
         verbose=verbose,
-        binary_names=["java.exe"],
+        binary_names=['java.exe'],
     )
 
     if options is not None:
-        if isinstance(options, str):
+        if isinstance(options, string_types):
             options = options.split()
         _java_options = list(options)
 
@@ -78,7 +88,7 @@ def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None, blocking=Tru
         standard input, standard output and standard error file
         handles, respectively.  Valid values are ``subprocess.PIPE``,
         an existing file descriptor (a positive integer), an existing
-        file object, 'pipe', 'stdout', 'devnull' and None.  ``subprocess.PIPE`` indicates that a
+        file object, and None.  ``subprocess.PIPE`` indicates that a
         new pipe to the child should be created.  With None, no
         redirection will occur; the child's file handles will be
         inherited from the parent.  Additionally, stderr can be
@@ -98,26 +108,21 @@ def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None, blocking=Tru
 
     :raise OSError: If the java command returns a nonzero return code.
     """
-
-    subprocess_output_dict = {
-        "pipe": subprocess.PIPE,
-        "stdout": subprocess.STDOUT,
-        "devnull": subprocess.DEVNULL,
-    }
-
-    stdin = subprocess_output_dict.get(stdin, stdin)
-    stdout = subprocess_output_dict.get(stdout, stdout)
-    stderr = subprocess_output_dict.get(stderr, stderr)
-
-    if isinstance(cmd, str):
-        raise TypeError("cmd should be a list of strings")
+    if stdin == 'pipe':
+        stdin = subprocess.PIPE
+    if stdout == 'pipe':
+        stdout = subprocess.PIPE
+    if stderr == 'pipe':
+        stderr = subprocess.PIPE
+    if isinstance(cmd, string_types):
+        raise TypeError('cmd should be a list of strings')
 
     # Make sure we know where a java binary is.
     if _java_bin is None:
         config_java()
 
     # Set up the classpath.
-    if isinstance(classpath, str):
+    if isinstance(classpath, string_types):
         classpaths = [classpath]
     else:
         classpaths = list(classpath)
@@ -125,7 +130,7 @@ def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None, blocking=Tru
 
     # Construct the full command string.
     cmd = list(cmd)
-    cmd = ["-cp", classpath] + cmd
+    cmd = ['-cp', classpath] + cmd
     cmd = [_java_bin] + _java_options + cmd
 
     # Call java via a subprocess
@@ -137,7 +142,7 @@ def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None, blocking=Tru
     # Check the return code.
     if p.returncode != 0:
         print(_decode_stdoutdata(stderr))
-        raise OSError("Java command failed : " + str(cmd))
+        raise OSError('Java command failed : ' + str(cmd))
 
     return (stdout, stderr)
 
@@ -151,15 +156,15 @@ if 0:
     # Read:
     (a, b) = java(
         [
-            "weka.classifiers.bayes.NaiveBayes",
-            "-l",
-            "/tmp/names.model",
-            "-T",
-            "/tmp/test.arff",
-            "-p",
-            "0",
+            'weka.classifiers.bayes.NaiveBayes',
+            '-l',
+            '/tmp/names.model',
+            '-T',
+            '/tmp/test.arff',
+            '-p',
+            '0',
         ],  # , '-distribution'],
-        classpath="/Users/edloper/Desktop/weka/weka.jar",
+        classpath='/Users/edloper/Desktop/weka/weka.jar',
     )
 
 
@@ -181,7 +186,7 @@ class ReadError(ValueError):
         self.position = position
 
     def __str__(self):
-        return "Expected %s at %s" % (self.expected, self.position)
+        return 'Expected %s at %s' % (self.expected, self.position)
 
 
 _STRING_START_RE = re.compile(r"[uU]?[rR]?(\"\"\"|\'\'\'|\"|\')")
@@ -222,17 +227,17 @@ def read_str(s, start_position):
     # Read the open quote, and any modifiers.
     m = _STRING_START_RE.match(s, start_position)
     if not m:
-        raise ReadError("open quote", start_position)
+        raise ReadError('open quote', start_position)
     quotemark = m.group(1)
 
     # Find the close quote.
-    _STRING_END_RE = re.compile(r"\\|%s" % quotemark)
+    _STRING_END_RE = re.compile(r'\\|%s' % quotemark)
     position = m.end()
     while True:
         match = _STRING_END_RE.search(s, position)
         if not match:
-            raise ReadError("close quote", position)
-        if match.group(0) == "\\":
+            raise ReadError('close quote', position)
+        if match.group(0) == '\\':
             position = match.end() + 1
         else:
             break
@@ -242,10 +247,10 @@ def read_str(s, start_position):
     try:
         return eval(s[start_position : match.end()]), match.end()
     except ValueError as e:
-        raise ReadError("invalid string (%s)" % e)
+        raise ReadError('invalid string (%s)' % e)
 
 
-_READ_INT_RE = re.compile(r"-?\d+")
+_READ_INT_RE = re.compile(r'-?\d+')
 
 
 def read_int(s, start_position):
@@ -278,11 +283,11 @@ def read_int(s, start_position):
     """
     m = _READ_INT_RE.match(s, start_position)
     if not m:
-        raise ReadError("integer", start_position)
+        raise ReadError('integer', start_position)
     return int(m.group()), m.end()
 
 
-_READ_NUMBER_VALUE = re.compile(r"-?(\d*)([.]?\d*)?")
+_READ_NUMBER_VALUE = re.compile(r'-?(\d*)([.]?\d*)?')
 
 
 def read_number(s, start_position):
@@ -315,7 +320,7 @@ def read_number(s, start_position):
     """
     m = _READ_NUMBER_VALUE.match(s, start_position)
     if not m or not (m.group(1) or m.group(2)):
-        raise ReadError("number", start_position)
+        raise ReadError('number', start_position)
     if m.group(2):
         return float(m.group()), m.end()
     else:
@@ -346,16 +351,17 @@ def overridden(method):
 
     :type method: instance method
     """
-    if isinstance(method, types.MethodType) and method.__self__.__class__ is not None:
+    # [xx] breaks on classic classes!
+    if isinstance(method, types.MethodType) and compat.get_im_class(method) is not None:
         name = method.__name__
         funcs = [
             cls.__dict__[name]
-            for cls in _mro(method.__self__.__class__)
+            for cls in _mro(compat.get_im_class(method))
             if name in cls.__dict__
         ]
         return len(funcs) > 1
     else:
-        raise TypeError("Expected an instance method.")
+        raise TypeError('Expected an instance method.')
 
 
 def _mro(cls):
@@ -383,22 +389,22 @@ def _mro(cls):
 
 def _add_epytext_field(obj, field, message):
     """Add an epytext @field to a given object's docstring."""
-    indent = ""
+    indent = ''
     # If we already have a docstring, then add a blank line to separate
     # it from the new field, and check its indentation.
     if obj.__doc__:
-        obj.__doc__ = obj.__doc__.rstrip() + "\n\n"
-        indents = re.findall(r"(?<=\n)[ ]+(?!\s)", obj.__doc__.expandtabs())
+        obj.__doc__ = obj.__doc__.rstrip() + '\n\n'
+        indents = re.findall(r'(?<=\n)[ ]+(?!\s)', obj.__doc__.expandtabs())
         if indents:
             indent = min(indents)
     # If we don't have a docstring, add an empty one.
     else:
-        obj.__doc__ = ""
+        obj.__doc__ = ''
 
     obj.__doc__ += textwrap.fill(
-        "@%s: %s" % (field, message),
+        '@%s: %s' % (field, message),
         initial_indent=indent,
-        subsequent_indent=indent + "    ",
+        subsequent_indent=indent + '    ',
     )
 
 
@@ -416,7 +422,7 @@ def deprecated(message):
 
     def decorator(func):
         msg = "Function %s() has been deprecated.  %s" % (func.__name__, message)
-        msg = "\n" + textwrap.fill(msg, initial_indent="  ", subsequent_indent="  ")
+        msg = '\n' + textwrap.fill(msg, initial_indent='  ', subsequent_indent='  ')
 
         def newFunc(*args, **kwargs):
             warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
@@ -428,7 +434,7 @@ def deprecated(message):
         newFunc.__doc__ = func.__doc__
         newFunc.__deprecated__ = True
         # Add a @deprecated field to the docstring.
-        _add_epytext_field(newFunc, "deprecated", message)
+        _add_epytext_field(newFunc, 'deprecated', message)
         return newFunc
 
     return decorator
@@ -457,22 +463,22 @@ class Deprecated(object):
             if Deprecated in base.__bases__:
                 dep_cls = base
                 break
-        assert dep_cls, "Unable to determine which base is deprecated."
+        assert dep_cls, 'Unable to determine which base is deprecated.'
 
         # Construct an appropriate warning.
-        doc = dep_cls.__doc__ or "".strip()
+        doc = dep_cls.__doc__ or ''.strip()
         # If there's a @deprecated field, strip off the field marker.
-        doc = re.sub(r"\A\s*@deprecated:", r"", doc)
+        doc = re.sub(r'\A\s*@deprecated:', r'', doc)
         # Strip off any indentation.
-        doc = re.sub(r"(?m)^\s*", "", doc)
+        doc = re.sub(r'(?m)^\s*', '', doc)
         # Construct a 'name' string.
-        name = "Class %s" % dep_cls.__name__
+        name = 'Class %s' % dep_cls.__name__
         if cls != dep_cls:
-            name += " (base class for %s)" % cls.__name__
+            name += ' (base class for %s)' % cls.__name__
         # Put it all together.
-        msg = "%s has been deprecated.  %s" % (name, doc)
+        msg = '%s has been deprecated.  %s' % (name, doc)
         # Wrap it.
-        msg = "\n" + textwrap.fill(msg, initial_indent="    ", subsequent_indent="    ")
+        msg = '\n' + textwrap.fill(msg, initial_indent='    ', subsequent_indent='    ')
         warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
         # Do the actual work of __new__.
         return object.__new__(cls)
@@ -521,10 +527,10 @@ def find_file_iter(
     :param verbose: Whether or not to print path when a file is found.
     """
     file_names = [filename] + (file_names or [])
-    assert isinstance(filename, str)
-    assert not isinstance(file_names, str)
-    assert not isinstance(searchpath, str)
-    if isinstance(env_vars, str):
+    assert isinstance(filename, string_types)
+    assert not isinstance(file_names, string_types)
+    assert not isinstance(searchpath, string_types)
+    if isinstance(env_vars, string_types):
         env_vars = env_vars.split()
     yielded = False
 
@@ -533,20 +539,20 @@ def find_file_iter(
         path_to_file = os.path.join(filename, alternative)
         if os.path.isfile(path_to_file):
             if verbose:
-                print("[Found %s: %s]" % (filename, path_to_file))
+                print('[Found %s: %s]' % (filename, path_to_file))
             yielded = True
             yield path_to_file
         # Check the bare alternatives
         if os.path.isfile(alternative):
             if verbose:
-                print("[Found %s: %s]" % (filename, alternative))
+                print('[Found %s: %s]' % (filename, alternative))
             yielded = True
             yield alternative
         # Check if the alternative is inside a 'file' directory
-        path_to_file = os.path.join(filename, "file", alternative)
+        path_to_file = os.path.join(filename, 'file', alternative)
         if os.path.isfile(path_to_file):
             if verbose:
-                print("[Found %s: %s]" % (filename, path_to_file))
+                print('[Found %s: %s]' % (filename, path_to_file))
             yielded = True
             yield path_to_file
 
@@ -561,7 +567,7 @@ def find_file_iter(
                 # Check if the environment variable contains a direct path to the bin
                 if os.path.isfile(env_dir):
                     if verbose:
-                        print("[Found %s: %s]" % (filename, env_dir))
+                        print('[Found %s: %s]' % (filename, env_dir))
                     yielded = True
                     yield env_dir
                 # Check if the possible bin names exist inside the environment variable directories
@@ -569,18 +575,18 @@ def find_file_iter(
                     path_to_file = os.path.join(env_dir, alternative)
                     if os.path.isfile(path_to_file):
                         if verbose:
-                            print("[Found %s: %s]" % (filename, path_to_file))
+                            print('[Found %s: %s]' % (filename, path_to_file))
                         yielded = True
                         yield path_to_file
                     # Check if the alternative is inside a 'file' directory
                     # path_to_file = os.path.join(env_dir, 'file', alternative)
 
                     # Check if the alternative is inside a 'bin' directory
-                    path_to_file = os.path.join(env_dir, "bin", alternative)
+                    path_to_file = os.path.join(env_dir, 'bin', alternative)
 
                     if os.path.isfile(path_to_file):
                         if verbose:
-                            print("[Found %s: %s]" % (filename, path_to_file))
+                            print('[Found %s: %s]' % (filename, path_to_file))
                         yielded = True
                         yield path_to_file
 
@@ -594,11 +600,11 @@ def find_file_iter(
 
     # If we're on a POSIX system, then try using the 'which' command
     # to find the file.
-    if os.name == "posix":
+    if os.name == 'posix':
         for alternative in file_names:
             try:
                 p = subprocess.Popen(
-                    ["which", alternative],
+                    ['which', alternative],
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE,
                 )
@@ -606,7 +612,7 @@ def find_file_iter(
                 path = _decode_stdoutdata(stdout).strip()
                 if path.endswith(alternative) and os.path.exists(path):
                     if verbose:
-                        print("[Found %s: %s]" % (filename, path))
+                        print('[Found %s: %s]' % (filename, path))
                     yielded = True
                     yield path
             except (KeyboardInterrupt, SystemExit, OSError):
@@ -621,15 +627,15 @@ def find_file_iter(
             "configuration paramaters" % filename
         )
         if env_vars:
-            msg += " or set the %s environment variable" % env_vars[0]
-        msg += "."
+            msg += ' or set the %s environment variable' % env_vars[0]
+        msg += '.'
         if searchpath:
-            msg += "\n\n  Searched in:"
-            msg += "".join("\n    - %s" % d for d in searchpath)
+            msg += '\n\n  Searched in:'
+            msg += ''.join('\n    - %s' % d for d in searchpath)
         if url:
-            msg += "\n\n  For more information on %s, see:\n    <%s>" % (filename, url)
-        div = "=" * 75
-        raise LookupError("\n\n%s\n%s\n%s" % (div, msg, div))
+            msg += '\n\n  For more information on %s, see:\n    <%s>' % (filename, url)
+        div = '=' * 75
+        raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
 
 
 def find_file(
@@ -713,14 +719,14 @@ def find_jar_iter(
     :param is_regex: Whether name is a regular expression.
     """
 
-    assert isinstance(name_pattern, str)
-    assert not isinstance(searchpath, str)
-    if isinstance(env_vars, str):
+    assert isinstance(name_pattern, string_types)
+    assert not isinstance(searchpath, string_types)
+    if isinstance(env_vars, string_types):
         env_vars = env_vars.split()
     yielded = False
 
     # Make sure we check the CLASSPATH first
-    env_vars = ["CLASSPATH"] + list(env_vars)
+    env_vars = ['CLASSPATH'] + list(env_vars)
 
     # If an explicit location was given, then check it, and yield it if
     # it's present; otherwise, complain.
@@ -730,14 +736,14 @@ def find_jar_iter(
             yield path_to_jar
         else:
             raise LookupError(
-                "Could not find %s jar file at %s" % (name_pattern, path_to_jar)
+                'Could not find %s jar file at %s' % (name_pattern, path_to_jar)
             )
 
     # Check environment variables
     for env_var in env_vars:
         if env_var in os.environ:
-            if env_var == "CLASSPATH":
-                classpath = os.environ["CLASSPATH"]
+            if env_var == 'CLASSPATH':
+                classpath = os.environ['CLASSPATH']
                 for cp in classpath.split(os.path.pathsep):
                     if os.path.isfile(cp):
                         filename = os.path.basename(cp)
@@ -747,7 +753,7 @@ def find_jar_iter(
                             or (not is_regex and filename == name_pattern)
                         ):
                             if verbose:
-                                print("[Found %s: %s]" % (name_pattern, cp))
+                                print('[Found %s: %s]' % (name_pattern, cp))
                             yielded = True
                             yield cp
                     # The case where user put directory containing the jar file in the classpath
@@ -755,7 +761,7 @@ def find_jar_iter(
                         if not is_regex:
                             if os.path.isfile(os.path.join(cp, name_pattern)):
                                 if verbose:
-                                    print("[Found %s: %s]" % (name_pattern, cp))
+                                    print('[Found %s: %s]' % (name_pattern, cp))
                                 yielded = True
                                 yield os.path.join(cp, name_pattern)
                         else:
@@ -764,7 +770,7 @@ def find_jar_iter(
                                 if re.match(name_pattern, file_name):
                                     if verbose:
                                         print(
-                                            "[Found %s: %s]"
+                                            '[Found %s: %s]'
                                             % (
                                                 name_pattern,
                                                 os.path.join(cp, file_name),
@@ -792,7 +798,7 @@ def find_jar_iter(
                             or (not is_regex and filename == name_pattern)
                         ):
                             if verbose:
-                                print("[Found %s: %s]" % (name_pattern, path_to_jar))
+                                print('[Found %s: %s]' % (name_pattern, path_to_jar))
                             yielded = True
                             yield path_to_jar
 
@@ -804,14 +810,14 @@ def find_jar_iter(
                 if os.path.isfile(path_to_jar):
                     if re.match(name_pattern, filename):
                         if verbose:
-                            print("[Found %s: %s]" % (filename, path_to_jar))
+                            print('[Found %s: %s]' % (filename, path_to_jar))
                 yielded = True
                 yield path_to_jar
         else:
             path_to_jar = os.path.join(directory, name_pattern)
             if os.path.isfile(path_to_jar):
                 if verbose:
-                    print("[Found %s: %s]" % (name_pattern, path_to_jar))
+                    print('[Found %s: %s]' % (name_pattern, path_to_jar))
                 yielded = True
                 yield path_to_jar
 
@@ -819,18 +825,18 @@ def find_jar_iter(
         # If nothing was found, raise an error
         msg = "NLTK was unable to find %s!" % name_pattern
         if env_vars:
-            msg += " Set the %s environment variable" % env_vars[0]
-        msg = textwrap.fill(msg + ".", initial_indent="  ", subsequent_indent="  ")
+            msg += ' Set the %s environment variable' % env_vars[0]
+        msg = textwrap.fill(msg + '.', initial_indent='  ', subsequent_indent='  ')
         if searchpath:
-            msg += "\n\n  Searched in:"
-            msg += "".join("\n    - %s" % d for d in searchpath)
+            msg += '\n\n  Searched in:'
+            msg += ''.join('\n    - %s' % d for d in searchpath)
         if url:
-            msg += "\n\n  For more information, on %s, see:\n    <%s>" % (
+            msg += '\n\n  For more information, on %s, see:\n    <%s>' % (
                 name_pattern,
                 url,
             )
-        div = "=" * 75
-        raise LookupError("\n\n%s\n%s\n%s" % (div, msg, div))
+        div = '=' * 75
+        raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
 
 
 def find_jar(
@@ -853,7 +859,7 @@ def find_jars_within_path(path_to_jars):
     return [
         os.path.join(root, filename)
         for root, dirnames, filenames in os.walk(path_to_jars)
-        for filename in fnmatch.filter(filenames, "*.jar")
+        for filename in fnmatch.filter(filenames, '*.jar')
     ]
 
 
@@ -884,7 +890,7 @@ def import_from_stdlib(module):
     instead (causing the import to fail).
     """
     old_path = sys.path
-    sys.path = [d for d in sys.path if d not in ("", ".")]
+    sys.path = [d for d in sys.path if d not in ('', '.')]
     m = __import__(module)
     sys.path = old_path
     return m
@@ -895,7 +901,7 @@ def import_from_stdlib(module):
 ##########################################################################
 
 
-
+@compat.python_2_unicode_compatible
 class ElementWrapper(object):
     """
     A wrapper around ElementTree Element objects whose main purpose is
@@ -931,9 +937,9 @@ class ElementWrapper(object):
             <Element "<?xml version='1.0' encoding='utf8'?>\n<test />">
 
         """
-        if isinstance(etree, str):
+        if isinstance(etree, string_types):
             etree = ElementTree.fromstring(etree)
-        self.__dict__["_etree"] = etree
+        self.__dict__['_etree'] = etree
 
     def unwrap(self):
         """
@@ -946,13 +952,13 @@ class ElementWrapper(object):
     ##////////////////////////////////////////////////////////////
 
     def __repr__(self):
-        s = ElementTree.tostring(self._etree, encoding="utf8").decode("utf8")
+        s = ElementTree.tostring(self._etree, encoding='utf8').decode('utf8')
         if len(s) > 60:
-            e = s.rfind("<")
+            e = s.rfind('<')
             if (len(s) - e) > 30:
                 e = -20
-            s = "%s...%s" % (s[:30], s[e:])
-        return "<Element %r>" % s
+            s = '%s...%s' % (s[:30], s[e:])
+        return '<Element %r>' % s
 
     def __str__(self):
         """
@@ -960,7 +966,7 @@ class ElementWrapper(object):
         the wrapped Element object.
         """
         return (
-            ElementTree.tostring(self._etree, encoding="utf8").decode("utf8").rstrip()
+            ElementTree.tostring(self._etree, encoding='utf8').decode('utf8').rstrip()
         )
 
     ##////////////////////////////////////////////////////////////
@@ -1061,7 +1067,7 @@ def slice_bounds(sequence, slice_obj, allow_step=False):
     # Otherwise, make sure that no non-default step value is used.
     elif slice_obj.step not in (None, 1):
         raise ValueError(
-            "slices with steps are not supported by %s" % sequence.__class__.__name__
+            'slices with steps are not supported by %s' % sequence.__class__.__name__
         )
 
     # Supply default offsets.
@@ -1103,7 +1109,7 @@ def is_writable(path):
         return False
 
     # If we're on a posix system, check its permissions.
-    if hasattr(os, "getuid"):
+    if hasattr(os, 'getuid'):
         statdata = os.stat(path)
         perm = stat.S_IMODE(statdata.st_mode)
         # is it world-writable?
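
The lookup helpers patched in this file (find_file_iter / find_binary / find_jar)
are usually driven as below; a hedged sketch using the same env vars as the
config_java() hunk above (the resolved path depends on the local machine):

    from nltk.internals import find_binary

    # Checks explicit paths, then JAVAHOME/JAVA_HOME, then PATH (via `which` on POSIX).
    java_bin = find_binary('java', env_vars=['JAVAHOME', 'JAVA_HOME'],
                           binary_names=['java.exe'], verbose=True)
    print(java_bin)
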
index f15fea1..f85f67b 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: JSON Encoder/Decoder Helpers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Xu <xxu@student.unimelb.edu.au>
 #
 # URL: <http://nltk.org/>
@@ -19,20 +19,20 @@ import json
 
 json_tags = {}
 
-TAG_PREFIX = "!"
+TAG_PREFIX = '!'
 
 
 def register_tag(cls):
     """
     Decorates a class to register its json tag.
     """
-    json_tags[TAG_PREFIX + getattr(cls, "json_tag")] = cls
+    json_tags[TAG_PREFIX + getattr(cls, 'json_tag')] = cls
     return cls
 
 
 class JSONTaggedEncoder(json.JSONEncoder):
     def default(self, obj):
-        obj_tag = getattr(obj, "json_tag", None)
+        obj_tag = getattr(obj, 'json_tag', None)
         if obj_tag is None:
             return super(JSONTaggedEncoder, self).default(obj)
         obj_tag = TAG_PREFIX + obj_tag
@@ -55,12 +55,12 @@ class JSONTaggedDecoder(json.JSONDecoder):
         if not isinstance(obj, dict) or len(obj) != 1:
             return obj
         obj_tag = next(iter(obj.keys()))
-        if not obj_tag.startswith("!"):
+        if not obj_tag.startswith('!'):
             return obj
         if obj_tag not in json_tags:
-            raise ValueError("Unknown tag", obj_tag)
+            raise ValueError('Unknown tag', obj_tag)
         obj_cls = json_tags[obj_tag]
         return obj_cls.decode_json_obj(obj[obj_tag])
 
 
-__all__ = ["register_tag", "json_tags", "JSONTaggedEncoder", "JSONTaggedDecoder"]
+__all__ = ['register_tag', 'json_tags', 'JSONTaggedEncoder', 'JSONTaggedDecoder']
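
For context, a class registered with the tag machinery above supplies a json_tag
plus encode/decode hooks; a minimal sketch (the Point class is illustrative, not
an NLTK class, and assumes the encoder delegates to encode_json_obj()):

    import json
    from nltk.jsontags import register_tag, JSONTaggedEncoder, JSONTaggedDecoder

    @register_tag
    class Point(object):
        json_tag = 'Point'

        def __init__(self, x, y):
            self.x, self.y = x, y

        def encode_json_obj(self):
            return {'x': self.x, 'y': self.y}

        @classmethod
        def decode_json_obj(cls, obj):
            return cls(obj['x'], obj['y'])

    s = json.dumps(Point(1, 2), cls=JSONTaggedEncoder)   # tagged as "!Point"
    p = json.loads(s, cls=JSONTaggedDecoder)             # back to a Point instance
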
index 266df76..e51f2c2 100644 (file)
@@ -14,6 +14,7 @@
     See the documentation for further information on copyrights,
     or contact the author. All Rights Reserved.
 """
+from __future__ import print_function
 
 ### Constants
 
@@ -49,7 +50,7 @@ class LazyModule:
     __lazymodule_init = 0
 
     # Name of the module to load
-    __lazymodule_name = ""
+    __lazymodule_name = ''
 
     # Flag which indicates whether the module was loaded or not
     __lazymodule_loaded = 0
@@ -74,9 +75,9 @@ class LazyModule:
         if globals is None:
             globals = locals
         self.__lazymodule_globals = globals
-        mainname = globals.get("__name__", "")
+        mainname = globals.get('__name__', '')
         if mainname:
-            self.__name__ = mainname + "." + name
+            self.__name__ = mainname + '.' + name
             self.__lazymodule_name = name
         else:
             self.__name__ = self.__lazymodule_name = name
@@ -91,9 +92,9 @@ class LazyModule:
         if self.__lazymodule_loaded:
             return self.__lazymodule_locals[name]
         if _debug:
-            print("LazyModule: Loading module %r" % name)
+            print('LazyModule: Loading module %r' % name)
         self.__lazymodule_locals[name] = module = __import__(
-            name, self.__lazymodule_locals, self.__lazymodule_globals, "*"
+            name, self.__lazymodule_locals, self.__lazymodule_globals, '*'
         )
 
         # Fill namespace with all symbols from original module to
@@ -101,10 +102,10 @@ class LazyModule:
         self.__dict__.update(module.__dict__)
 
         # Set import flag
-        self.__dict__["__lazymodule_loaded"] = 1
+        self.__dict__['__lazymodule_loaded'] = 1
 
         if _debug:
-            print("LazyModule: Module %r loaded" % name)
+            print('LazyModule: Module %r loaded' % name)
         return module
 
     def __getattr__(self, name):
@@ -115,8 +116,8 @@ class LazyModule:
             raise AttributeError(name)
         if _debug:
             print(
-                "LazyModule: "
-                "Module load triggered by attribute %r read access" % name
+                'LazyModule: '
+                'Module load triggered by attribute %r read access' % name
             )
         module = self.__lazymodule_import()
         return getattr(module, name)
@@ -134,8 +135,8 @@ class LazyModule:
             return
         if _debug:
             print(
-                "LazyModule: "
-                "Module load triggered by attribute %r write access" % name
+                'LazyModule: '
+                'Module load triggered by attribute %r write access' % name
             )
         module = self.__lazymodule_import()
         setattr(module, name, value)
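
The lazy loading above only triggers the real import on first attribute access;
a small illustration (the wrapped module is arbitrary):

    from nltk.lazyimport import LazyModule

    re_module = LazyModule('re', locals(), globals())
    # No import has happened yet; this first attribute read loads `re`
    # and copies its namespace into the wrapper.
    print(re_module.findall(r'\d+', 'nltk 3.4'))
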
index 5b87ef7..b76799b 100644 (file)
@@ -1,6 +1,7 @@
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit: Language Models
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors: Ilia Kurenkov <ilia.kurenkov@gmail.com>
 # URL: <http://nltk.org/
 # For license information, see LICENSE.TXT
@@ -201,7 +202,7 @@ One cool feature of ngram models is that they can be used to generate text.
     >>> lm.generate(1, random_seed=3)
     '<s>'
     >>> lm.generate(5, random_seed=3)
-    ['<s>', 'a', 'b', 'c', 'd']
+    ['<s>', 'a', 'b', 'c', '</s>']
 
 Provide `random_seed` if you want to consistently reproduce the same text all
 other things being equal. Here we are using it to test the examples.
@@ -210,7 +211,7 @@ You can also condition your generation on some preceding text with the `context`
 argument.
 
     >>> lm.generate(5, text_seed=['c'], random_seed=3)
-    ['</s>', 'c', 'd', 'c', 'd']
+    ['</s>', '<s>', 'a', 'b', 'c']
 
 Note that an ngram model is restricted in how much preceding context it can
 take into account. For example, a trigram model can only condition its output
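
The generate() outputs above come from the usual fit-then-generate pipeline; a
hedged sketch on a toy corpus in the spirit of this docstring:

    from nltk.lm import MLE
    from nltk.lm.preprocessing import padded_everygram_pipeline

    text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]
    train, vocab = padded_everygram_pipeline(2, text)
    lm = MLE(2)
    lm.fit(train, vocab)
    print(lm.generate(5, random_seed=3))
    print(lm.generate(5, text_seed=['c'], random_seed=3))
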
index 5f3ebc2..2b68b77 100644 (file)
Binary files a/nlp_resource_data/nltk/lm/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/lm/__pycache__/__init__.cpython-37.pyc differ
index f9c0a39..27ae6cc 100644 (file)
Binary files a/nlp_resource_data/nltk/lm/__pycache__/api.cpython-37.pyc and b/nlp_resource_data/nltk/lm/__pycache__/api.cpython-37.pyc differ
index d7a2490..4475148 100644 (file)
Binary files a/nlp_resource_data/nltk/lm/__pycache__/counter.cpython-37.pyc and b/nlp_resource_data/nltk/lm/__pycache__/counter.cpython-37.pyc differ
index 2c5d88e..ae62dd5 100644 (file)
Binary files a/nlp_resource_data/nltk/lm/__pycache__/models.cpython-37.pyc and b/nlp_resource_data/nltk/lm/__pycache__/models.cpython-37.pyc differ
index e7c7319..8f9e7f4 100644 (file)
Binary files a/nlp_resource_data/nltk/lm/__pycache__/preprocessing.cpython-37.pyc and b/nlp_resource_data/nltk/lm/__pycache__/preprocessing.cpython-37.pyc differ
index 0918ee6..c7dce44 100644 (file)
Binary files a/nlp_resource_data/nltk/lm/__pycache__/smoothing.cpython-37.pyc and b/nlp_resource_data/nltk/lm/__pycache__/smoothing.cpython-37.pyc differ
index a4ab715..bdbe660 100644 (file)
Binary files a/nlp_resource_data/nltk/lm/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/lm/__pycache__/util.cpython-37.pyc differ
index edc8fd1..91e69c7 100644 (file)
Binary files a/nlp_resource_data/nltk/lm/__pycache__/vocabulary.cpython-37.pyc and b/nlp_resource_data/nltk/lm/__pycache__/vocabulary.cpython-37.pyc differ
index 60e9f87..e2ea244 100644 (file)
@@ -1,38 +1,55 @@
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit: Language Models
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors: Ilia Kurenkov <ilia.kurenkov@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 """Language Model Interface."""
+from __future__ import division, unicode_literals
 
 import random
 from abc import ABCMeta, abstractmethod
 from bisect import bisect
 
+from six import add_metaclass
 
 from nltk.lm.counter import NgramCounter
 from nltk.lm.util import log_base2
 from nltk.lm.vocabulary import Vocabulary
 
-from itertools import accumulate
-
-
-class Smoothing(metaclass=ABCMeta):
+try:
+    from itertools import accumulate
+except ImportError:
+    import operator
+
+    def accumulate(iterable, func=operator.add):
+        """Return running totals"""
+        # accumulate([1,2,3,4,5]) --> 1 3 6 10 15
+        # accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
+        it = iter(iterable)
+        try:
+            total = next(it)
+        except StopIteration:
+            return
+        yield total
+        for element in it:
+            total = func(total, element)
+            yield total
+
+
+@add_metaclass(ABCMeta)
+class Smoothing(object):
     """Ngram Smoothing Interface
 
     Implements Chen & Goodman 1995's idea that all smoothing algorithms have
-    certain features in common. This should ideally allow smoothing algorithms to
+    certain features in common. This should ideally allow smoothing algorithms to
     work both with Backoff and Interpolation.
+
+    counter holds the ngram counts used by the smoothing algorithm.
     """
 
     def __init__(self, vocabulary, counter):
-        """
-        :param vocabulary: The Ngram vocabulary object.
-        :type vocabulary: nltk.lm.vocab.Vocabulary
-        :param counter: The counts of the vocabulary items.
-        :type counter: nltk.lm.counter.NgramCounter
-        """
         self.vocab = vocabulary
         self.counts = counter
 
@@ -56,7 +73,7 @@ def _random_generator(seed_or_generator):
     return random.Random(seed_or_generator)
 
 
-def _weighted_choice(population, weights, random_generator=None):
+def _weighted_choice(population, weights, random_seed=None):
     """Like random.choice, but with weights.
 
     Heavily inspired by python 3.6 `random.choices`.
@@ -67,11 +84,12 @@ def _weighted_choice(population, weights, random_generator=None):
         raise ValueError("The number of weights does not match the population")
     cum_weights = list(accumulate(weights))
     total = cum_weights[-1]
-    threshold = random_generator.random()
+    threshold = _random_generator(random_seed).random()
     return population[bisect(cum_weights, total * threshold)]
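
In isolation, the cumulative-weights trick used by _weighted_choice is just
accumulate plus bisect (the values below are made up):

    import random
    from bisect import bisect
    from itertools import accumulate   # the try/except above backports this for Python 2

    population = ['a', 'b', 'c']
    weights = [0.1, 0.3, 0.6]
    cum_weights = list(accumulate(weights))              # ~[0.1, 0.4, 1.0]
    threshold = random.Random(3).random()
    print(population[bisect(cum_weights, cum_weights[-1] * threshold)])
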
 
 
-class LanguageModel(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class LanguageModel(object):
     """ABC for Language Models.
 
     Cannot be directly instantiated itself.
@@ -106,7 +124,7 @@ class LanguageModel(metaclass=ABCMeta):
         if not self.vocab:
             if vocabulary_text is None:
                 raise ValueError(
-                    "Cannot fit without a vocabulary or text to create it from."
+                    "Cannot fit without a vocabulary or text to " "create it from."
                 )
             self.vocab.update(vocabulary_text)
         self.counts.update(self.vocab.lookup(sent) for sent in text)
@@ -181,8 +199,8 @@ class LanguageModel(metaclass=ABCMeta):
 
         :param int num_words: How many words to generate. By default 1.
         :param text_seed: Generation can be conditioned on preceding context.
-        :param random_seed: A random seed or an instance of `random.Random`. If provided,
-        makes the random sampling part of generation reproducible.
+        :param random_seed: If provided, makes the random sampling part of
+        generation reproducible.
         :return: One (str) word or a list of words generated from model.
 
         Examples:
@@ -198,8 +216,7 @@ class LanguageModel(metaclass=ABCMeta):
 
         """
         text_seed = [] if text_seed is None else list(text_seed)
-        random_generator = _random_generator(random_seed)
-        # This is the base recursion case.
+        # base recursion case
         if num_words == 1:
             context = (
                 text_seed[-self.order + 1 :]
@@ -210,23 +227,21 @@ class LanguageModel(metaclass=ABCMeta):
             while context and not samples:
                 context = context[1:] if len(context) > 1 else []
                 samples = self.context_counts(self.vocab.lookup(context))
-            # Sorting samples achieves two things:
+            # sorting achieves two things:
             # - reproducible randomness when sampling
-            # - turns Mapping into Sequence which `_weighted_choice` expects
+            # - turning Mapping into Sequence which _weighted_choice expects
             samples = sorted(samples)
             return _weighted_choice(
-                samples,
-                tuple(self.score(w, context) for w in samples),
-                random_generator,
+                samples, tuple(self.score(w, context) for w in samples), random_seed
             )
-        # We build up text one word at a time using the preceding context.
+        # build up text one word at a time
         generated = []
         for _ in range(num_words):
             generated.append(
                 self.generate(
                     num_words=1,
                     text_seed=text_seed + generated,
-                    random_seed=random_generator,
+                    random_seed=random_seed,
                 )
             )
         return generated
index 09be9b4..1ceaa42 100644 (file)
@@ -1,6 +1,7 @@
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -9,13 +10,17 @@ Language Model Counter
 ----------------------
 """
 
-from collections import defaultdict
-from collections.abc import Sequence
+from __future__ import unicode_literals
 
+from collections import Sequence, defaultdict
+
+from six import string_types
+from nltk import compat
 from nltk.probability import ConditionalFreqDist, FreqDist
 
 
-class NgramCounter:
+@compat.python_2_unicode_compatible
+class NgramCounter(object):
     """Class for counting ngrams.
 
     Will count any ngram sequence you give it ;)
@@ -146,7 +151,7 @@ class NgramCounter:
         """User-friendly access to ngram counts."""
         if isinstance(item, int):
             return self._counts[item]
-        elif isinstance(item, str):
+        elif isinstance(item, string_types):
             return self._counts.__getitem__(1)[item]
         elif isinstance(item, Sequence):
             return self._counts.__getitem__(len(item) + 1)[tuple(item)]
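
The three isinstance branches above correspond to the following access patterns
(toy counts, for illustration only):

    from nltk.lm.counter import NgramCounter

    counts = NgramCounter([[('a',), ('b',), ('a', 'b'), ('b', 'c')]])
    print(counts[1])           # int index  -> FreqDist over all unigrams
    print(counts['a'])         # str index  -> count of the unigram 'a'
    print(counts[['a']]['b'])  # sequence   -> counts of words following context ('a',)
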
index f459af3..639a1ce 100644 (file)
@@ -1,15 +1,19 @@
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit: Language Models
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 """Language Models"""
+from __future__ import division, unicode_literals
 
+from nltk import compat
 from nltk.lm.api import LanguageModel, Smoothing
 from nltk.lm.smoothing import KneserNey, WittenBell
 
 
+@compat.python_2_unicode_compatible
 class MLE(LanguageModel):
     """Class for providing MLE ngram model scores.
 
@@ -26,6 +30,7 @@ class MLE(LanguageModel):
         return self.context_counts(context).freq(word)
 
 
+@compat.python_2_unicode_compatible
 class Lidstone(LanguageModel):
     """Provides Lidstone-smoothed scores.
 
@@ -34,7 +39,7 @@ class Lidstone(LanguageModel):
     """
 
     def __init__(self, gamma, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+        super(Lidstone, self).__init__(*args, **kwargs)
         self.gamma = gamma
 
     def unmasked_score(self, word, context=None):
@@ -49,6 +54,7 @@ class Lidstone(LanguageModel):
         return (word_count + self.gamma) / (norm_count + len(self.vocab) * self.gamma)
 
 
+@compat.python_2_unicode_compatible
 class Laplace(Lidstone):
     """Implements Laplace (add one) smoothing.
 
@@ -56,31 +62,24 @@ class Laplace(Lidstone):
     """
 
     def __init__(self, *args, **kwargs):
-        super().__init__(1, *args, **kwargs)
+        super(Laplace, self).__init__(1, *args, **kwargs)
 
 
 class InterpolatedLanguageModel(LanguageModel):
     """Logic common to all interpolated language models.
 
     The idea to abstract this comes from Chen & Goodman 1995.
-    Do not instantiate this class directly!
     """
 
     def __init__(self, smoothing_cls, order, **kwargs):
         assert issubclass(smoothing_cls, Smoothing)
         params = kwargs.pop("params", {})
-        super().__init__(order, **kwargs)
+        super(InterpolatedLanguageModel, self).__init__(order, **kwargs)
         self.estimator = smoothing_cls(self.vocab, self.counts, **params)
 
     def unmasked_score(self, word, context=None):
         if not context:
-            # The base recursion case: no context, we only have a unigram.
             return self.estimator.unigram_score(word)
-        if not self.counts[context]:
-            # It can also happen that we have no data for this context.
-            # In that case we defer to the lower-order ngram.
-            # This is the same as setting alpha to 0 and gamma to 1.
-            return self.unmasked_score(word, context[1:])
         alpha, gamma = self.estimator.alpha_gamma(word, context)
         return alpha + gamma * self.unmasked_score(word, context[1:])
 
@@ -89,11 +88,13 @@ class WittenBellInterpolated(InterpolatedLanguageModel):
     """Interpolated version of Witten-Bell smoothing."""
 
     def __init__(self, order, **kwargs):
-        super().__init__(WittenBell, order, **kwargs)
+        super(WittenBellInterpolated, self).__init__(WittenBell, order, **kwargs)
 
 
 class KneserNeyInterpolated(InterpolatedLanguageModel):
     """Interpolated version of Kneser-Ney smoothing."""
 
     def __init__(self, order, discount=0.1, **kwargs):
-        super().__init__(KneserNey, order, params={"discount": discount}, **kwargs)
+        super(KneserNeyInterpolated, self).__init__(
+            KneserNey, order, params={"discount": discount}, **kwargs
+        )
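
A minimal sketch of fitting and scoring one of the models in this file, using the
Lidstone formula from the hunk above (toy corpus, gamma chosen arbitrarily):

    from nltk.lm import Lidstone
    from nltk.lm.preprocessing import padded_everygram_pipeline

    train, vocab = padded_everygram_pipeline(2, [['a', 'b', 'c']])
    lm = Lidstone(0.5, 2)   # gamma=0.5, bigram model
    lm.fit(train, vocab)
    # score('b' | 'a') = (count('a b') + 0.5) / (count('a') + 0.5 * len(lm.vocab))
    print(lm.score('b', ['a']))
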
index 8279cd4..c9d695a 100644 (file)
@@ -1,6 +1,7 @@
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit: Language Model Unit Tests
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
index 02b8df2..693e2da 100644 (file)
@@ -1,6 +1,7 @@
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit: Language Model Unit Tests
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -20,37 +21,58 @@ def _count_non_zero_vals(dictionary):
 class WittenBell(Smoothing):
     """Witten-Bell smoothing."""
 
-    def __init__(self, vocabulary, counter, **kwargs):
-        super().__init__(vocabulary, counter, **kwargs)
+    def __init__(self, vocabulary, counter, discount=0.1, **kwargs):
+        super(WittenBell, self).__init__(vocabulary, counter, *kwargs)
+        self.counts = counter
 
     def alpha_gamma(self, word, context):
-        alpha = self.counts[context].freq(word)
-        gamma = self._gamma(context)
-        return (1.0 - gamma) * alpha, gamma
-
-    def _gamma(self, context):
-        n_plus = _count_non_zero_vals(self.counts[context])
-        return n_plus / (n_plus + self.counts[len(context) + 1].N())
+        gamma = self.gamma(context)
+        return (1.0 - gamma) * self.alpha(word, context), gamma
 
     def unigram_score(self, word):
         return self.counts.unigrams.freq(word)
 
+    def alpha(self, word, context):
+        return self.counts[context].freq(word)
+
+    def gamma(self, context):
+        n_plus = _count_non_zero_vals(self.counts[context])
+        return n_plus / (n_plus + self.counts[len(context) + 1].N())
+
 
 class KneserNey(Smoothing):
     """Kneser-Ney Smoothing."""
 
     def __init__(self, vocabulary, counter, discount=0.1, **kwargs):
-        super().__init__(vocabulary, counter, **kwargs)
+        super(KneserNey, self).__init__(vocabulary, counter, *kwargs)
         self.discount = discount
+        self.vocabulary = vocabulary
 
     def unigram_score(self, word):
-        return 1.0 / len(self.vocab)
+        return 1.0 / len(self.vocabulary)
 
     def alpha_gamma(self, word, context):
         prefix_counts = self.counts[context]
-        prefix_total_ngrams = prefix_counts.N()
-        alpha = max(prefix_counts[word] - self.discount, 0.0) / prefix_total_ngrams
-        gamma = (
-            self.discount * _count_non_zero_vals(prefix_counts) / prefix_total_ngrams
-        )
-        return alpha, gamma
+        return self.alpha(word, prefix_counts), self.gamma(prefix_counts)
+
+    def alpha(self, word, prefix_counts):
+        return max(prefix_counts[word] - self.discount, 0.0) / prefix_counts.N()
+
+    def gamma(self, prefix_counts):
+        return self.discount * _count_non_zero_vals(prefix_counts) / prefix_counts.N()
+
+
+class GoodTuring(Smoothing):
+    """Good-Turing smoothing."""
+    def __init__(self, vocabulary, counter, **kwargs):
+        super(GoodTuring, self).__init__(vocabulary, counter, **kwargs)
+        self.counts = counter
+        self.vocabulary = vocabulary
+
+    def unigram_score(self, word):
+        word_count = self.counts[word]
+        count_plus_1 = 0.0
+        for every_context in self.counts.keys():
+            if len(every_context.split()) == word_count + 1:
+                count_plus_1 += 1
+        return count_plus_1 / len(self.vocabulary)
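A quick way to exercise the interpolated Kneser-Ney model after a change like this is the standard nltk.lm training pipeline. The sketch below assumes the public nltk.lm API (fit/score and padded_everygram_pipeline) available in this release line, with made-up toy sentences:

    from nltk.lm import KneserNeyInterpolated
    from nltk.lm.preprocessing import padded_everygram_pipeline

    sentences = [["a", "b", "c"], ["a", "c", "d", "c"]]
    train, vocab = padded_everygram_pipeline(2, sentences)

    lm = KneserNeyInterpolated(2, discount=0.1)
    lm.fit(train, vocab)
    print(lm.score("b", ["a"]))  # P(b | a) under interpolated Kneser-Ney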
index 0da2eb2..62457e3 100644 (file)
@@ -1,6 +1,7 @@
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
index 39ba6b2..3c7439d 100644 (file)
@@ -1,16 +1,26 @@
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 """Language Model Vocabulary"""
 
+from __future__ import unicode_literals
+
 import sys
-from collections import Counter
-from collections.abc import Iterable
+from collections import Counter, Iterable
 from itertools import chain
-from functools import singledispatch
+
+from nltk import compat
+
+try:
+    # Python >= 3.4
+    from functools import singledispatch
+except ImportError:
+    # Python < 3.4
+    from singledispatch import singledispatch
 
 
 @singledispatch
@@ -30,13 +40,22 @@ def _(words, vocab):
     return tuple(_dispatched_lookup(w, vocab) for w in words)
 
 
-@_dispatched_lookup.register(str)
+try:
+    # Python 2 unicode + str type
+    basestring
+except NameError:
+    # Python 3 unicode + str type
+    basestring = str
+
+
+@_dispatched_lookup.register(basestring)
 def _string_lookup(word, vocab):
     """Looks up one word in the vocabulary."""
     return word if word in vocab else vocab.unk_label
 
 
-class Vocabulary:
+@compat.python_2_unicode_compatible
+class Vocabulary(object):
     """Stores language model vocabulary.
 
     Satisfies two common language modeling requirements for a vocabulary:
@@ -116,7 +135,7 @@ class Vocabulary:
     ('<UNK>', 'a', '<UNK>', 'd', '<UNK>', 'c')
 
     It's possible to update the counts after the vocabulary has been created.
-    In general, the interface is the same as that of `collections.Counter`.
+    The interface follows that of `collections.Counter`.
 
     >>> vocab['b']
     1
@@ -217,6 +236,12 @@ class Vocabulary:
             and self.counts == other.counts
         )
 
+    if sys.version_info[0] == 2:
+        # see https://stackoverflow.com/a/35781654/4501212
+        def __ne__(self, other):
+            equal = self.__eq__(other)
+            return equal if equal is NotImplemented else not equal
+
     def __str__(self):
         return "<{0} with cutoff={1} unk_label='{2}' and {3} items>".format(
             self.__class__.__name__, self.cutoff, self.unk_label, len(self)
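The singledispatch-based lookup above dispatches on the argument type: a single string maps to itself or to unk_label, while any other iterable is mapped element-wise. A small sketch of that behaviour, assuming the public nltk.lm.Vocabulary interface (unk_cutoff and lookup) shown in the docstring above:

    from nltk.lm import Vocabulary

    vocab = Vocabulary(["a", "b", "b", "c", "c", "c"], unk_cutoff=2)
    print(vocab.lookup("c"))          # 'c'     (count meets the cutoff)
    print(vocab.lookup("a"))          # '<UNK>' (count below the cutoff)
    print(vocab.lookup(["a", "c"]))   # ('<UNK>', 'c') via the sequence branch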
index 59be7df..227fda4 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Metrics
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -24,7 +24,6 @@ from nltk.metrics.scores import (
 from nltk.metrics.confusionmatrix import ConfusionMatrix
 from nltk.metrics.distance import (
     edit_distance,
-    edit_distance_align,
     binary_distance,
     jaccard_distance,
     masi_distance,
@@ -40,7 +39,6 @@ from nltk.metrics.association import (
     NgramAssocMeasures,
     BigramAssocMeasures,
     TrigramAssocMeasures,
-    QuadgramAssocMeasures,
     ContingencyMeasures,
 )
 from nltk.metrics.spearman import (
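Because edit_distance_align and QuadgramAssocMeasures are dropped from the package namespace here, any caller that still imports them from nltk.metrics needs a guard. A hypothetical fallback pattern:

    try:
        from nltk.metrics import QuadgramAssocMeasures
    except ImportError:
        # Not exported by this older metrics package; degrade gracefully.
        QuadgramAssocMeasures = None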
index b0a9b1a..c0fcc7c 100644 (file)
Binary files a/nlp_resource_data/nltk/metrics/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/metrics/__pycache__/__init__.cpython-37.pyc differ
index 152e55e..b00991b 100644 (file)
Binary files a/nlp_resource_data/nltk/metrics/__pycache__/agreement.cpython-37.pyc and b/nlp_resource_data/nltk/metrics/__pycache__/agreement.cpython-37.pyc differ
index a3fe392..9076b3d 100644 (file)
Binary files a/nlp_resource_data/nltk/metrics/__pycache__/aline.cpython-37.pyc and b/nlp_resource_data/nltk/metrics/__pycache__/aline.cpython-37.pyc differ
index d668cc3..049c745 100644 (file)
Binary files a/nlp_resource_data/nltk/metrics/__pycache__/association.cpython-37.pyc and b/nlp_resource_data/nltk/metrics/__pycache__/association.cpython-37.pyc differ
index 5f05b53..5e5972e 100644 (file)
Binary files a/nlp_resource_data/nltk/metrics/__pycache__/confusionmatrix.cpython-37.pyc and b/nlp_resource_data/nltk/metrics/__pycache__/confusionmatrix.cpython-37.pyc differ
index addf2e9..e32f05a 100644 (file)
Binary files a/nlp_resource_data/nltk/metrics/__pycache__/distance.cpython-37.pyc and b/nlp_resource_data/nltk/metrics/__pycache__/distance.cpython-37.pyc differ
index 111907f..914b15b 100644 (file)
Binary files a/nlp_resource_data/nltk/metrics/__pycache__/paice.cpython-37.pyc and b/nlp_resource_data/nltk/metrics/__pycache__/paice.cpython-37.pyc differ
index 3a17b16..594541d 100644 (file)
Binary files a/nlp_resource_data/nltk/metrics/__pycache__/scores.cpython-37.pyc and b/nlp_resource_data/nltk/metrics/__pycache__/scores.cpython-37.pyc differ
index c8ae7b7..479d2ef 100644 (file)
Binary files a/nlp_resource_data/nltk/metrics/__pycache__/segmentation.cpython-37.pyc and b/nlp_resource_data/nltk/metrics/__pycache__/segmentation.cpython-37.pyc differ
index 608ae3d..d125645 100644 (file)
Binary files a/nlp_resource_data/nltk/metrics/__pycache__/spearman.cpython-37.pyc and b/nlp_resource_data/nltk/metrics/__pycache__/spearman.cpython-37.pyc differ
index 8fff0eb..445a87f 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Agreement Metrics
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Tom Lippincott <tom@cs.columbia.edu>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -68,19 +68,24 @@ Expected results from the Artstein and Poesio survey paper:
     1.0
 
 """
+from __future__ import print_function, unicode_literals, division
 
 import logging
 from itertools import groupby
 from operator import itemgetter
 
+from six import iteritems
+
 from nltk.probability import FreqDist, ConditionalFreqDist
 from nltk.internals import deprecated
+from nltk.compat import python_2_unicode_compatible
 
 from nltk.metrics.distance import binary_distance
 
 log = logging.getLogger(__name__)
 
 
+@python_2_unicode_compatible
 class AnnotationTask(object):
     """Represents an annotation task, i.e. people assign labels to items.
 
@@ -117,7 +122,7 @@ class AnnotationTask(object):
         return "\r\n".join(
             map(
                 lambda x: "%s\t%s\t%s"
-                % (x["coder"], x["item"].replace("_", "\t"), ",".join(x["labels"])),
+                % (x['coder'], x['item'].replace('_', "\t"), ",".join(x['labels'])),
                 self.data,
             )
         )
@@ -132,7 +137,7 @@ class AnnotationTask(object):
             self.C.add(coder)
             self.K.add(labels)
             self.I.add(item)
-            self.data.append({"coder": coder, "labels": labels, "item": item})
+            self.data.append({'coder': coder, 'labels': labels, 'item': item})
 
     def agr(self, cA, cB, i, data=None):
         """Agreement between two coders on a given item
@@ -142,29 +147,32 @@ class AnnotationTask(object):
         # cfedermann: we don't know what combination of coder/item will come
         # first in x; to avoid StopIteration problems due to assuming an order
         # cA,cB, we allow either for k1 and then look up the missing as k2.
-        k1 = next((x for x in data if x["coder"] in (cA, cB) and x["item"] == i))
-        if k1["coder"] == cA:
-            k2 = next((x for x in data if x["coder"] == cB and x["item"] == i))
+        k1 = next((x for x in data if x['coder'] in (cA, cB) and x['item'] == i))
+        if k1['coder'] == cA:
+            k2 = next((x for x in data if x['coder'] == cB and x['item'] == i))
         else:
-            k2 = next((x for x in data if x["coder"] == cA and x["item"] == i))
+            k2 = next((x for x in data if x['coder'] == cA and x['item'] == i))
 
-        ret = 1.0 - float(self.distance(k1["labels"], k2["labels"]))
+        ret = 1.0 - float(self.distance(k1['labels'], k2['labels']))
         log.debug("Observed agreement between %s and %s on %s: %f", cA, cB, i, ret)
         log.debug(
-            'Distance between "%r" and "%r": %f', k1["labels"], k2["labels"], 1.0 - ret
+            "Distance between \"%r\" and \"%r\": %f",
+            k1['labels'],
+            k2['labels'],
+            1.0 - ret,
         )
         return ret
 
     def Nk(self, k):
-        return float(sum(1 for x in self.data if x["labels"] == k))
+        return float(sum(1 for x in self.data if x['labels'] == k))
 
     def Nik(self, i, k):
-        return float(sum(1 for x in self.data if x["item"] == i and x["labels"] == k))
+        return float(sum(1 for x in self.data if x['item'] == i and x['labels'] == k))
 
     def Nck(self, c, k):
-        return float(sum(1 for x in self.data if x["coder"] == c and x["labels"] == k))
+        return float(sum(1 for x in self.data if x['coder'] == c and x['labels'] == k))
 
-    @deprecated("Use Nk, Nik or Nck instead")
+    @deprecated('Use Nk, Nik or Nck instead')
     def N(self, k=None, i=None, c=None):
         """Implements the "n-notation" used in Artstein and Poesio (2007)
 
@@ -191,7 +199,7 @@ class AnnotationTask(object):
 
         """
         data = self._grouped_data(
-            "item", (x for x in self.data if x["coder"] in (cA, cB))
+            'item', (x for x in self.data if x['coder'] in (cA, cB))
         )
         ret = sum(self.agr(cA, cB, item, item_data) for item, item_data in data) / len(
             self.I
@@ -227,10 +235,10 @@ class AnnotationTask(object):
 
         """
         total = 0.0
-        data = (x for x in self.data if x["coder"] in (cA, cB))
-        for i, itemdata in self._grouped_data("item", data):
+        data = (x for x in self.data if x['coder'] in (cA, cB))
+        for i, itemdata in self._grouped_data('item', data):
             # we should have two items; distance doesn't care which comes first
-            total += self.distance(next(itemdata)["labels"], next(itemdata)["labels"])
+            total += self.distance(next(itemdata)['labels'], next(itemdata)['labels'])
 
         ret = total / (len(self.I) * max_distance)
         log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret)
@@ -261,8 +269,8 @@ class AnnotationTask(object):
 
         """
         total = 0.0
-        label_freqs = FreqDist(x["labels"] for x in self.data)
-        for k, f in label_freqs.items():
+        label_freqs = FreqDist(x['labels'] for x in self.data)
+        for k, f in iteritems(label_freqs):
             total += f ** 2
         Ae = total / ((len(self.I) * len(self.C)) ** 2)
         return (self.avg_Ao() - Ae) / (1 - Ae)
@@ -270,7 +278,7 @@ class AnnotationTask(object):
     def Ae_kappa(self, cA, cB):
         Ae = 0.0
         nitems = float(len(self.I))
-        label_freqs = ConditionalFreqDist((x["labels"], x["coder"]) for x in self.data)
+        label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
         for k in label_freqs.conditions():
             Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
         return Ae
@@ -302,8 +310,8 @@ class AnnotationTask(object):
     def Disagreement(self, label_freqs):
         total_labels = sum(label_freqs.values())
         pairs = 0.0
-        for j, nj in label_freqs.items():
-            for l, nl in label_freqs.items():
+        for j, nj in iteritems(label_freqs):
+            for l, nl in iteritems(label_freqs):
                 pairs += float(nj * nl) * self.distance(l, j)
         return 1.0 * pairs / (total_labels * (total_labels - 1))
 
@@ -324,9 +332,9 @@ class AnnotationTask(object):
         total_ratings = 0
         all_valid_labels_freq = FreqDist([])
 
-        total_do = 0.0  # Total observed disagreement for all items.
-        for i, itemdata in self._grouped_data("item"):
-            label_freqs = FreqDist(x["labels"] for x in itemdata)
+        total_do = 0.0 # Total observed disagreement for all items.
+        for i, itemdata in self._grouped_data('item'):
+            label_freqs = FreqDist(x['labels'] for x in itemdata)
             labels_count = sum(label_freqs.values())
             if labels_count < 2:
                 # Ignore the item.
@@ -336,7 +344,7 @@ class AnnotationTask(object):
 
         do = total_do / sum(all_valid_labels_freq.values())
 
-        de = self.Disagreement(all_valid_labels_freq)  # Expected disagreement.
+        de = self.Disagreement(all_valid_labels_freq) # Expected disagreement.
         k_alpha = 1.0 - do / de
 
         return k_alpha
@@ -347,7 +355,7 @@ class AnnotationTask(object):
         """
         total = 0.0
         label_freqs = ConditionalFreqDist(
-            (x["coder"], x["labels"]) for x in self.data if x["coder"] in (cA, cB)
+            (x['coder'], x['labels']) for x in self.data if x['coder'] in (cA, cB)
         )
         for j in self.K:
             for l in self.K:
@@ -367,7 +375,7 @@ class AnnotationTask(object):
         )
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
 
     import re
     import optparse
@@ -415,7 +423,7 @@ if __name__ == "__main__":
         "-v",
         "--verbose",
         dest="verbose",
-        default="0",
+        default='0',
         help="how much debugging to print on stderr (0-4)",
     )
     parser.add_option(
@@ -457,7 +465,7 @@ if __name__ == "__main__":
 
     # read in data from the specified file
     data = []
-    with open(options.file, "r") as infile:
+    with open(options.file, 'r') as infile:
         for l in infile:
             toks = l.split(options.columnsep)
             coder, object_, labels = (
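The agreement hunks above swap Python 3 dict iteration for six.iteritems so the same loops run on Python 2; the pattern matches the expected-agreement and Disagreement loops in this file. A standalone sketch with made-up counts:

    from six import iteritems

    label_freqs = {"yes": 3, "no": 1}
    # iteritems() is a lazy iterator on Python 2 and plain dict.items() on Python 3.
    total = sum(count ** 2 for _, count in iteritems(label_freqs))
    print(total)  # 10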
index 4b88bb4..3b8cba5 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: ALINE
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Greg Kondrak <gkondrak@ualberta.ca>
 #         Geoff Bacon <bacon@berkeley.edu> (Python port)
 # URL: <http://nltk.org/>
@@ -38,6 +38,8 @@ Example usage
 University of Toronto.
 """
 
+from __future__ import unicode_literals
+
 try:
     import numpy as np
 except ImportError:
@@ -45,7 +47,7 @@ except ImportError:
 
 # === Constants ===
 
-inf = float("inf")
+inf = float('inf')
 
 # Default values for maximum similarity scores (Kondrak 2002: 54)
 C_skip = 10  # Indels
@@ -54,1022 +56,1022 @@ C_exp = 45  # Expansions/compressions
 C_vwl = 5  # Vowel/consonant relative weight (decreased from 10)
 
 consonants = [
-    "B",
-    "N",
-    "R",
-    "b",
-    "c",
-    "d",
-    "f",
-    "g",
-    "h",
-    "j",
-    "k",
-    "l",
-    "m",
-    "n",
-    "p",
-    "q",
-    "r",
-    "s",
-    "t",
-    "v",
-    "x",
-    "z",
-    "ç",
-    "ð",
-    "ħ",
-    "ŋ",
-    "ɖ",
-    "ɟ",
-    "ɢ",
-    "ɣ",
-    "ɦ",
-    "ɬ",
-    "ɮ",
-    "ɰ",
-    "ɱ",
-    "ɲ",
-    "ɳ",
-    "ɴ",
-    "ɸ",
-    "ɹ",
-    "ɻ",
-    "ɽ",
-    "ɾ",
-    "ʀ",
-    "ʁ",
-    "ʂ",
-    "ʃ",
-    "ʈ",
-    "ʋ",
-    "ʐ ",
-    "ʒ",
-    "ʔ",
-    "ʕ",
-    "ʙ",
-    "ʝ",
-    "β",
-    "θ",
-    "χ",
-    "ʐ",
-    "w",
+    'B',
+    'N',
+    'R',
+    'b',
+    'c',
+    'd',
+    'f',
+    'g',
+    'h',
+    'j',
+    'k',
+    'l',
+    'm',
+    'n',
+    'p',
+    'q',
+    'r',
+    's',
+    't',
+    'v',
+    'x',
+    'z',
+    'ç',
+    'ð',
+    'ħ',
+    'ŋ',
+    'ɖ',
+    'ɟ',
+    'ɢ',
+    'ɣ',
+    'ɦ',
+    'ɬ',
+    'ɮ',
+    'ɰ',
+    'ɱ',
+    'ɲ',
+    'ɳ',
+    'ɴ',
+    'ɸ',
+    'ɹ',
+    'ɻ',
+    'ɽ',
+    'ɾ',
+    'ʀ',
+    'ʁ',
+    'ʂ',
+    'ʃ',
+    'ʈ',
+    'ʋ',
+    'ʐ ',
+    'ʒ',
+    'ʔ',
+    'ʕ',
+    'ʙ',
+    'ʝ',
+    'β',
+    'θ',
+    'χ',
+    'ʐ',
+    'w',
 ]
 
 # Relevant features for comparing consonants and vowels
 R_c = [
-    "aspirated",
-    "lateral",
-    "manner",
-    "nasal",
-    "place",
-    "retroflex",
-    "syllabic",
-    "voice",
+    'aspirated',
+    'lateral',
+    'manner',
+    'nasal',
+    'place',
+    'retroflex',
+    'syllabic',
+    'voice',
 ]
 # 'high' taken out of R_v because same as manner
 R_v = [
-    "back",
-    "lateral",
-    "long",
-    "manner",
-    "nasal",
-    "place",
-    "retroflex",
-    "round",
-    "syllabic",
-    "voice",
+    'back',
+    'lateral',
+    'long',
+    'manner',
+    'nasal',
+    'place',
+    'retroflex',
+    'round',
+    'syllabic',
+    'voice',
 ]
 
 # Flattened feature matrix (Kondrak 2002: 56)
 similarity_matrix = {
     # place
-    "bilabial": 1.0,
-    "labiodental": 0.95,
-    "dental": 0.9,
-    "alveolar": 0.85,
-    "retroflex": 0.8,
-    "palato-alveolar": 0.75,
-    "palatal": 0.7,
-    "velar": 0.6,
-    "uvular": 0.5,
-    "pharyngeal": 0.3,
-    "glottal": 0.1,
-    "labiovelar": 1.0,
-    "vowel": -1.0,  # added 'vowel'
+    'bilabial': 1.0,
+    'labiodental': 0.95,
+    'dental': 0.9,
+    'alveolar': 0.85,
+    'retroflex': 0.8,
+    'palato-alveolar': 0.75,
+    'palatal': 0.7,
+    'velar': 0.6,
+    'uvular': 0.5,
+    'pharyngeal': 0.3,
+    'glottal': 0.1,
+    'labiovelar': 1.0,
+    'vowel': -1.0,  # added 'vowel'
     # manner
-    "stop": 1.0,
-    "affricate": 0.9,
-    "fricative": 0.85,  # increased fricative from 0.8
-    "trill": 0.7,
-    "tap": 0.65,
-    "approximant": 0.6,
-    "high vowel": 0.4,
-    "mid vowel": 0.2,
-    "low vowel": 0.0,
-    "vowel2": 0.5,  # added vowel
+    'stop': 1.0,
+    'affricate': 0.9,
+    'fricative': 0.85,  # increased fricative from 0.8
+    'trill': 0.7,
+    'tap': 0.65,
+    'approximant': 0.6,
+    'high vowel': 0.4,
+    'mid vowel': 0.2,
+    'low vowel': 0.0,
+    'vowel2': 0.5,  # added vowel
     # high
-    "high": 1.0,
-    "mid": 0.5,
-    "low": 0.0,
+    'high': 1.0,
+    'mid': 0.5,
+    'low': 0.0,
     # back
-    "front": 1.0,
-    "central": 0.5,
-    "back": 0.0,
+    'front': 1.0,
+    'central': 0.5,
+    'back': 0.0,
     # binary features
-    "plus": 1.0,
-    "minus": 0.0,
+    'plus': 1.0,
+    'minus': 0.0,
 }
 
 # Relative weights of phonetic features (Kondrak 2002: 55)
 salience = {
-    "syllabic": 5,
-    "place": 40,
-    "manner": 50,
-    "voice": 5,  # decreased from 10
-    "nasal": 20,  # increased from 10
-    "retroflex": 10,
-    "lateral": 10,
-    "aspirated": 5,
-    "long": 0,  # decreased from 1
-    "high": 3,  # decreased from 5
-    "back": 2,  # decreased from 5
-    "round": 2,  # decreased from 5
+    'syllabic': 5,
+    'place': 40,
+    'manner': 50,
+    'voice': 5,  # decreased from 10
+    'nasal': 20,  # increased from 10
+    'retroflex': 10,
+    'lateral': 10,
+    'aspirated': 5,
+    'long': 0,  # decreased from 1
+    'high': 3,  # decreased from 5
+    'back': 2,  # decreased from 5
+    'round': 2,  # decreased from 5
 }
 
 # (Kondrak 2002: 59-60)
 feature_matrix = {
     # Consonants
-    "p": {
-        "place": "bilabial",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "b": {
-        "place": "bilabial",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "t": {
-        "place": "alveolar",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "d": {
-        "place": "alveolar",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ʈ": {
-        "place": "retroflex",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "plus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɖ": {
-        "place": "retroflex",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "plus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "c": {
-        "place": "palatal",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɟ": {
-        "place": "palatal",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "k": {
-        "place": "velar",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "g": {
-        "place": "velar",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "q": {
-        "place": "uvular",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɢ": {
-        "place": "uvular",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ʔ": {
-        "place": "glottal",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "m": {
-        "place": "bilabial",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "plus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɱ": {
-        "place": "labiodental",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "plus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "n": {
-        "place": "alveolar",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "plus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɳ": {
-        "place": "retroflex",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "plus",
-        "retroflex": "plus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɲ": {
-        "place": "palatal",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "plus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ŋ": {
-        "place": "velar",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "plus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɴ": {
-        "place": "uvular",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "plus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "N": {
-        "place": "uvular",
-        "manner": "stop",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "plus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ʙ": {
-        "place": "bilabial",
-        "manner": "trill",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "B": {
-        "place": "bilabial",
-        "manner": "trill",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "r": {
-        "place": "alveolar",
-        "manner": "trill",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "plus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ʀ": {
-        "place": "uvular",
-        "manner": "trill",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "R": {
-        "place": "uvular",
-        "manner": "trill",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɾ": {
-        "place": "alveolar",
-        "manner": "tap",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɽ": {
-        "place": "retroflex",
-        "manner": "tap",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "plus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɸ": {
-        "place": "bilabial",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "β": {
-        "place": "bilabial",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "f": {
-        "place": "labiodental",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "v": {
-        "place": "labiodental",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "θ": {
-        "place": "dental",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ð": {
-        "place": "dental",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "s": {
-        "place": "alveolar",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "z": {
-        "place": "alveolar",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ʃ": {
-        "place": "palato-alveolar",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ʒ": {
-        "place": "palato-alveolar",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ʂ": {
-        "place": "retroflex",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "plus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ʐ": {
-        "place": "retroflex",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "plus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ç": {
-        "place": "palatal",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ʝ": {
-        "place": "palatal",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "x": {
-        "place": "velar",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɣ": {
-        "place": "velar",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "χ": {
-        "place": "uvular",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ʁ": {
-        "place": "uvular",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ħ": {
-        "place": "pharyngeal",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ʕ": {
-        "place": "pharyngeal",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "h": {
-        "place": "glottal",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɦ": {
-        "place": "glottal",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɬ": {
-        "place": "alveolar",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "minus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "plus",
-        "aspirated": "minus",
-    },
-    "ɮ": {
-        "place": "alveolar",
-        "manner": "fricative",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "plus",
-        "aspirated": "minus",
-    },
-    "ʋ": {
-        "place": "labiodental",
-        "manner": "approximant",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɹ": {
-        "place": "alveolar",
-        "manner": "approximant",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɻ": {
-        "place": "retroflex",
-        "manner": "approximant",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "plus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "j": {
-        "place": "palatal",
-        "manner": "approximant",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "ɰ": {
-        "place": "velar",
-        "manner": "approximant",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
-    },
-    "l": {
-        "place": "alveolar",
-        "manner": "approximant",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "plus",
-        "aspirated": "minus",
-    },
-    "w": {
-        "place": "labiovelar",
-        "manner": "approximant",
-        "syllabic": "minus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "aspirated": "minus",
+    'p': {
+        'place': 'bilabial',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'b': {
+        'place': 'bilabial',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    't': {
+        'place': 'alveolar',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'd': {
+        'place': 'alveolar',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ʈ': {
+        'place': 'retroflex',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'plus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɖ': {
+        'place': 'retroflex',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'plus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'c': {
+        'place': 'palatal',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɟ': {
+        'place': 'palatal',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'k': {
+        'place': 'velar',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'g': {
+        'place': 'velar',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'q': {
+        'place': 'uvular',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɢ': {
+        'place': 'uvular',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ʔ': {
+        'place': 'glottal',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'm': {
+        'place': 'bilabial',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'plus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɱ': {
+        'place': 'labiodental',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'plus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'n': {
+        'place': 'alveolar',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'plus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɳ': {
+        'place': 'retroflex',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'plus',
+        'retroflex': 'plus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɲ': {
+        'place': 'palatal',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'plus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ŋ': {
+        'place': 'velar',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'plus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɴ': {
+        'place': 'uvular',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'plus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'N': {
+        'place': 'uvular',
+        'manner': 'stop',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'plus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ʙ': {
+        'place': 'bilabial',
+        'manner': 'trill',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'B': {
+        'place': 'bilabial',
+        'manner': 'trill',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'r': {
+        'place': 'alveolar',
+        'manner': 'trill',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'plus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ʀ': {
+        'place': 'uvular',
+        'manner': 'trill',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'R': {
+        'place': 'uvular',
+        'manner': 'trill',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɾ': {
+        'place': 'alveolar',
+        'manner': 'tap',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɽ': {
+        'place': 'retroflex',
+        'manner': 'tap',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'plus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɸ': {
+        'place': 'bilabial',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'β': {
+        'place': 'bilabial',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'f': {
+        'place': 'labiodental',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'v': {
+        'place': 'labiodental',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'θ': {
+        'place': 'dental',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ð': {
+        'place': 'dental',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    's': {
+        'place': 'alveolar',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'z': {
+        'place': 'alveolar',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ʃ': {
+        'place': 'palato-alveolar',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ʒ': {
+        'place': 'palato-alveolar',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ʂ': {
+        'place': 'retroflex',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'plus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ʐ': {
+        'place': 'retroflex',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'plus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ç': {
+        'place': 'palatal',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ʝ': {
+        'place': 'palatal',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'x': {
+        'place': 'velar',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɣ': {
+        'place': 'velar',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'χ': {
+        'place': 'uvular',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ʁ': {
+        'place': 'uvular',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ħ': {
+        'place': 'pharyngeal',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ʕ': {
+        'place': 'pharyngeal',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'h': {
+        'place': 'glottal',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɦ': {
+        'place': 'glottal',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɬ': {
+        'place': 'alveolar',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'minus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'plus',
+        'aspirated': 'minus',
+    },
+    'ɮ': {
+        'place': 'alveolar',
+        'manner': 'fricative',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'plus',
+        'aspirated': 'minus',
+    },
+    'ʋ': {
+        'place': 'labiodental',
+        'manner': 'approximant',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɹ': {
+        'place': 'alveolar',
+        'manner': 'approximant',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɻ': {
+        'place': 'retroflex',
+        'manner': 'approximant',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'plus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'j': {
+        'place': 'palatal',
+        'manner': 'approximant',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɰ': {
+        'place': 'velar',
+        'manner': 'approximant',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
+    },
+    'l': {
+        'place': 'alveolar',
+        'manner': 'approximant',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'plus',
+        'aspirated': 'minus',
+    },
+    'w': {
+        'place': 'labiovelar',
+        'manner': 'approximant',
+        'syllabic': 'minus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'aspirated': 'minus',
     },
     # Vowels
-    "i": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "high",
-        "back": "front",
-        "round": "minus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "y": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "high",
-        "back": "front",
-        "round": "plus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "e": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "mid",
-        "back": "front",
-        "round": "minus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "E": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "mid",
-        "back": "front",
-        "round": "minus",
-        "long": "plus",
-        "aspirated": "minus",
-    },
-    "ø": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "mid",
-        "back": "front",
-        "round": "plus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "ɛ": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "mid",
-        "back": "front",
-        "round": "minus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "œ": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "mid",
-        "back": "front",
-        "round": "plus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "æ": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "low",
-        "back": "front",
-        "round": "minus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "a": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "low",
-        "back": "front",
-        "round": "minus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "A": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "low",
-        "back": "front",
-        "round": "minus",
-        "long": "plus",
-        "aspirated": "minus",
-    },
-    "ɨ": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "high",
-        "back": "central",
-        "round": "minus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "ʉ": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "high",
-        "back": "central",
-        "round": "plus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "ə": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "mid",
-        "back": "central",
-        "round": "minus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "u": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "high",
-        "back": "back",
-        "round": "plus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "U": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "high",
-        "back": "back",
-        "round": "plus",
-        "long": "plus",
-        "aspirated": "minus",
-    },
-    "o": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "mid",
-        "back": "back",
-        "round": "plus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "O": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "mid",
-        "back": "back",
-        "round": "plus",
-        "long": "plus",
-        "aspirated": "minus",
-    },
-    "ɔ": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "mid",
-        "back": "back",
-        "round": "plus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "ɒ": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "low",
-        "back": "back",
-        "round": "minus",
-        "long": "minus",
-        "aspirated": "minus",
-    },
-    "I": {
-        "place": "vowel",
-        "manner": "vowel2",
-        "syllabic": "plus",
-        "voice": "plus",
-        "nasal": "minus",
-        "retroflex": "minus",
-        "lateral": "minus",
-        "high": "high",
-        "back": "front",
-        "round": "minus",
-        "long": "plus",
-        "aspirated": "minus",
+    'i': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'high',
+        'back': 'front',
+        'round': 'minus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'y': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'high',
+        'back': 'front',
+        'round': 'plus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'e': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'mid',
+        'back': 'front',
+        'round': 'minus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'E': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'mid',
+        'back': 'front',
+        'round': 'minus',
+        'long': 'plus',
+        'aspirated': 'minus',
+    },
+    'ø': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'mid',
+        'back': 'front',
+        'round': 'plus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɛ': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'mid',
+        'back': 'front',
+        'round': 'minus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'œ': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'mid',
+        'back': 'front',
+        'round': 'plus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'æ': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'low',
+        'back': 'front',
+        'round': 'minus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'a': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'low',
+        'back': 'front',
+        'round': 'minus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'A': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'low',
+        'back': 'front',
+        'round': 'minus',
+        'long': 'plus',
+        'aspirated': 'minus',
+    },
+    'ɨ': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'high',
+        'back': 'central',
+        'round': 'minus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'ʉ': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'high',
+        'back': 'central',
+        'round': 'plus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'ə': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'mid',
+        'back': 'central',
+        'round': 'minus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'u': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'high',
+        'back': 'back',
+        'round': 'plus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'U': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'high',
+        'back': 'back',
+        'round': 'plus',
+        'long': 'plus',
+        'aspirated': 'minus',
+    },
+    'o': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'mid',
+        'back': 'back',
+        'round': 'plus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'O': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'mid',
+        'back': 'back',
+        'round': 'plus',
+        'long': 'plus',
+        'aspirated': 'minus',
+    },
+    'ɔ': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'mid',
+        'back': 'back',
+        'round': 'plus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'ɒ': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'low',
+        'back': 'back',
+        'round': 'minus',
+        'long': 'minus',
+        'aspirated': 'minus',
+    },
+    'I': {
+        'place': 'vowel',
+        'manner': 'vowel2',
+        'syllabic': 'plus',
+        'voice': 'plus',
+        'nasal': 'minus',
+        'retroflex': 'minus',
+        'lateral': 'minus',
+        'high': 'high',
+        'back': 'front',
+        'round': 'minus',
+        'long': 'plus',
+        'aspirated': 'minus',
     },
 }
 
@@ -1091,7 +1093,7 @@ def align(str1, str2, epsilon=0):
     (Kondrak 2002: 51)
     """
     if np is None:
-        raise ImportError("You need numpy in order to use the align function")
+        raise ImportError('You need numpy in order to use the align function')
 
     assert 0.0 <= epsilon <= 1.0, "Epsilon must be between 0.0 and 1.0."
     m = len(str1)
@@ -1164,10 +1166,10 @@ def _retrieve(i, j, s, S, T, str1, str2, out):
                 out,
             )
         elif S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T:
-            out.insert(0, ("-", str2[j - 1]))
+            out.insert(0, ('-', str2[j - 1]))
             _retrieve(i, j - 1, s + sigma_skip(str2[j - 1]), S, T, str1, str2, out)
         elif S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T:
-            out.insert(0, (str1[i - 1], "-"))
+            out.insert(0, (str1[i - 1], '-'))
             _retrieve(i - 1, j, s + sigma_skip(str1[i - 1]), S, T, str1, str2, out)
         elif S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T:
             out.insert(0, (str1[i - 1], str2[j - 1]))
@@ -1266,12 +1268,12 @@ def demo():
     A demonstration of the result of aligning phonetic sequences
     used in Kondrak's (2002) dissertation.
     """
-    data = [pair.split(",") for pair in cognate_data.split("\n")]
+    data = [pair.split(',') for pair in cognate_data.split('\n')]
     for pair in data:
         alignment = align(pair[0], pair[1])[0]
-        alignment = ["({}, {})".format(a[0], a[1]) for a in alignment]
-        alignment = " ".join(alignment)
-        print("{} ~ {} : {}".format(pair[0], pair[1], alignment))
+        alignment = ['({}, {})'.format(a[0], a[1]) for a in alignment]
+        alignment = ' '.join(alignment)
+        print('{} ~ {} : {}'.format(pair[0], pair[1], alignment))
 
 
 cognate_data = """jo,ʒə
@@ -1350,5 +1352,5 @@ ahkohkwa,ahkɛh
 pematesiweni,pematesewen
 asenja,aʔsɛn"""
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
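
For context, a minimal sketch of calling the align() function touched above, using one of the cognate pairs from the demo data; the import path nltk.metrics.aline is an assumption here, and numpy must be installed, as the ImportError hunk indicates:

    # Align two phonetic strings with Kondrak's ALINE algorithm.
    from nltk.metrics import aline

    alignments = aline.align('jo', 'ʒə')    # a list of optimal alignments
    for seg1, seg2 in alignments[0]:        # first alignment as segment pairs; '-' marks a skip
        print(seg1, seg2)
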
index c2638c8..4994f1f 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Ngram Association Measures
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Joel Nothman <jnothman@student.usyd.edu.au>
 # URL: <http://nltk.org>
 # For license information, see LICENSE.TXT
@@ -11,10 +11,13 @@ generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
 ``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
 """
 
+from __future__ import division
+
 import math as _math
 from abc import ABCMeta, abstractmethod
 from functools import reduce
 
+from six import add_metaclass
 
 _log2 = lambda x: _math.log(x, 2.0)
 _ln = _math.log
@@ -43,7 +46,8 @@ TOTAL = -1
 """Marginals index for the number of words in the data"""
 
 
-class NgramAssocMeasures(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class NgramAssocMeasures(object):
     """
     An abstract class defining a collection of generic association measures.
     Each public method returns a score, taking the following arguments::
@@ -127,7 +131,7 @@ class NgramAssocMeasures(metaclass=ABCMeta):
         argument power sets an exponent (default 3) for the numerator. No
         logarithm of the result is calculated.
         """
-        return marginals[NGRAM] ** kwargs.get("power", 3) / _product(
+        return marginals[NGRAM] ** kwargs.get('power', 3) / _product(
             marginals[UNIGRAMS]
         )
 
@@ -242,7 +246,7 @@ class BigramAssocMeasures(NgramAssocMeasures):
 
         n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
 
-        (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative="less")
+        (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
         return pvalue
 
     @staticmethod
@@ -438,12 +442,12 @@ class ContingencyMeasures(object):
 
     def __init__(self, measures):
         """Constructs a ContingencyMeasures given a NgramAssocMeasures class"""
-        self.__class__.__name__ = "Contingency" + measures.__class__.__name__
+        self.__class__.__name__ = 'Contingency' + measures.__class__.__name__
         for k in dir(measures):
-            if k.startswith("__"):
+            if k.startswith('__'):
                 continue
             v = getattr(measures, k)
-            if not k.startswith("_"):
+            if not k.startswith('_'):
                 v = self._make_contingency_fn(measures, v)
             setattr(self, k, v)
 
index 7d96d77..3f82f29 100644 (file)
@@ -1,14 +1,16 @@
 # Natural Language Toolkit: Confusion Matrices
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
-
+from __future__ import print_function, unicode_literals
 from nltk.probability import FreqDist
+from nltk.compat import python_2_unicode_compatible
 
 
+@python_2_unicode_compatible
 class ConfusionMatrix(object):
     """
     The confusion matrix between a list of reference values and a
@@ -42,7 +44,7 @@ class ConfusionMatrix(object):
             the same length.
         """
         if len(reference) != len(test):
-            raise ValueError("Lists must have the same length.")
+            raise ValueError('Lists must have the same length.')
 
         # Get a list of all values.
         if sort_by_count:
@@ -91,7 +93,7 @@ class ConfusionMatrix(object):
         return self._confusion[i][j]
 
     def __repr__(self):
-        return "<ConfusionMatrix: %s/%s correct>" % (self._correct, self._total)
+        return '<ConfusionMatrix: %s/%s correct>' % (self._correct, self._total)
 
     def __str__(self):
         return self.pretty_format()
@@ -135,30 +137,30 @@ class ConfusionMatrix(object):
 
         # Construct a format string for row values
         valuelen = max(len(val) for val in value_strings)
-        value_format = "%" + repr(valuelen) + "s | "
+        value_format = '%' + repr(valuelen) + 's | '
         # Construct a format string for matrix entries
         if show_percents:
             entrylen = 6
-            entry_format = "%5.1f%%"
-            zerostr = "     ."
+            entry_format = '%5.1f%%'
+            zerostr = '     .'
         else:
             entrylen = len(repr(self._max_conf))
-            entry_format = "%" + repr(entrylen) + "d"
-            zerostr = " " * (entrylen - 1) + "."
+            entry_format = '%' + repr(entrylen) + 'd'
+            zerostr = ' ' * (entrylen - 1) + '.'
 
         # Write the column values.
-        s = ""
+        s = ''
         for i in range(valuelen):
-            s += (" " * valuelen) + " |"
+            s += (' ' * valuelen) + ' |'
             for val in value_strings:
                 if i >= valuelen - len(val):
                     s += val[i - valuelen + len(val)].rjust(entrylen + 1)
                 else:
-                    s += " " * (entrylen + 1)
-            s += " |\n"
+                    s += ' ' * (entrylen + 1)
+            s += ' |\n'
 
         # Write a dividing line
-        s += "%s-+-%s+\n" % ("-" * valuelen, "-" * ((entrylen + 1) * len(values)))
+        s += '%s-+-%s+\n' % ('-' * valuelen, '-' * ((entrylen + 1) * len(values)))
 
         # Write the entries.
         for val, li in zip(value_strings, values):
@@ -173,29 +175,29 @@ class ConfusionMatrix(object):
                 else:
                     s += entry_format % confusion[i][j]
                 if i == j:
-                    prevspace = s.rfind(" ")
-                    s = s[:prevspace] + "<" + s[prevspace + 1 :] + ">"
+                    prevspace = s.rfind(' ')
+                    s = s[:prevspace] + '<' + s[prevspace + 1 :] + '>'
                 else:
-                    s += " "
-            s += "|\n"
+                    s += ' '
+            s += '|\n'
 
         # Write a dividing line
-        s += "%s-+-%s+\n" % ("-" * valuelen, "-" * ((entrylen + 1) * len(values)))
+        s += '%s-+-%s+\n' % ('-' * valuelen, '-' * ((entrylen + 1) * len(values)))
 
         # Write a key
-        s += "(row = reference; col = test)\n"
+        s += '(row = reference; col = test)\n'
         if not values_in_chart:
-            s += "Value key:\n"
+            s += 'Value key:\n'
             for i, value in enumerate(values):
-                s += "%6d: %s\n" % (i + 1, value)
+                s += '%6d: %s\n' % (i + 1, value)
 
         return s
 
     def key(self):
         values = self._values
-        str = "Value key:\n"
+        str = 'Value key:\n'
         indexlen = len(repr(len(values) - 1))
-        key_format = "  %" + repr(indexlen) + "d: %s\n"
+        key_format = '  %' + repr(indexlen) + 'd: %s\n'
         for i in range(len(values)):
             str += key_format % (i, values[i])
 
@@ -203,14 +205,14 @@ class ConfusionMatrix(object):
 
 
 def demo():
-    reference = "DET NN VB DET JJ NN NN IN DET NN".split()
-    test = "DET VB VB DET NN NN NN IN DET NN".split()
-    print("Reference =", reference)
-    print("Test    =", test)
-    print("Confusion matrix:")
+    reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
+    test = 'DET VB VB DET NN NN NN IN DET NN'.split()
+    print('Reference =', reference)
+    print('Test    =', test)
+    print('Confusion matrix:')
     print(ConfusionMatrix(reference, test))
     print(ConfusionMatrix(reference, test).pretty_format(sort_by_count=True))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
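
Beyond the demo above, a small hedged sketch of the item access this class supports (ConfusionMatrix is assumed to be re-exported from nltk.metrics):

    from nltk.metrics import ConfusionMatrix

    ref = 'DET NN VB DET JJ NN NN IN DET NN'.split()
    test = 'DET VB VB DET NN NN NN IN DET NN'.split()
    cm = ConfusionMatrix(ref, test)
    print(cm['NN', 'NN'])   # how often a reference NN was also tagged NN
    print(cm['JJ', 'NN'])   # how often a reference JJ was mis-tagged as NN
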
index ae988ab..e295afb 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Distance Metrics
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 #         Tom Lippincott <tom@cs.columbia.edu>
@@ -20,8 +20,10 @@ As metrics, they must satisfy the following three requirements:
 3. d(a, c) <= d(a, b) + d(b, c)
 """
 
+from __future__ import print_function
+from __future__ import division
+
 import warnings
-import operator
 
 
 def _edit_dist_init(len1, len2):
@@ -101,77 +103,6 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False):
     return lev[len1][len2]
 
 
-def _edit_dist_backtrace(lev):
-    i, j = len(lev) - 1, len(lev[0]) - 1
-    alignment = [(i, j)]
-
-    while (i, j) != (0, 0):
-        directions = [
-            (i - 1, j),  # skip s1
-            (i, j - 1),  # skip s2
-            (i - 1, j - 1),  # substitution
-        ]
-
-        direction_costs = (
-            (lev[i][j] if (i >= 0 and j >= 0) else float("inf"), (i, j))
-            for i, j in directions
-        )
-        _, (i, j) = min(direction_costs, key=operator.itemgetter(0))
-
-        alignment.append((i, j))
-    return list(reversed(alignment))
-
-
-def edit_distance_align(s1, s2, substitution_cost=1):
-    """
-    Calculate the minimum Levenshtein edit-distance based alignment
-    mapping between two strings. The alignment finds the mapping
-    from string s1 to s2 that minimizes the edit distance cost.
-    For example, mapping "rain" to "shine" would involve 2
-    substitutions, 2 matches and an insertion resulting in
-    the following mapping:
-    [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)]
-    NB: (0, 0) is the start state without any letters associated
-    See more: https://web.stanford.edu/class/cs124/lec/med.pdf
-
-    In case of multiple valid minimum-distance alignments, the
-    backtrace has the following operation precedence:
-    1. Skip s1 character
-    2. Skip s2 character
-    3. Substitute s1 and s2 characters
-    The backtrace is carried out in reverse string order.
-
-    This function does not support transposition.
-
-    :param s1, s2: The strings to be aligned
-    :type s1: str
-    :type s2: str
-    :type substitution_cost: int
-    :rtype List[Tuple(int, int)]
-    """
-    # set up a 2-D array
-    len1 = len(s1)
-    len2 = len(s2)
-    lev = _edit_dist_init(len1 + 1, len2 + 1)
-
-    # iterate over the array
-    for i in range(len1):
-        for j in range(len2):
-            _edit_dist_step(
-                lev,
-                i + 1,
-                j + 1,
-                s1,
-                s2,
-                substitution_cost=substitution_cost,
-                transpositions=False,
-            )
-
-    # backtrace to find alignment
-    alignment = _edit_dist_backtrace(lev)
-    return alignment
-
-
 def binary_distance(label1, label2):
     """Simple equality test.
 
@@ -261,7 +192,7 @@ def fractional_presence(label):
 
 def custom_distance(file):
     data = {}
-    with open(file, "r") as infile:
+    with open(file, 'r') as infile:
         for l in infile:
             labelA, labelB, dist = l.strip().split("\t")
             labelA = frozenset([labelA])
@@ -482,5 +413,5 @@ def demo():
     print("MASI distance:", masi_distance(s1, s2))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
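
For context, a small usage sketch of edit_distance() from this file (assumed to be nltk/metrics/distance.py); the distances in the comments are worked out by hand:

    from nltk.metrics.distance import edit_distance

    print(edit_distance('rain', 'shine'))                    # 3: two substitutions plus one insertion
    print(edit_distance('abc', 'acb'))                       # 2: counted as two substitutions
    print(edit_distance('abc', 'acb', transpositions=True))  # 1: adjacent swap counted as one edit
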
index 46e8fce..b26069b 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Agreement Metrics
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Lauri Hallila <laurihallila@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -24,7 +24,7 @@ from math import sqrt
 
 
 def get_words_from_dictionary(lemmas):
-    """
+    '''
     Get original set of words used for analysis.
 
     :param lemmas: A dictionary where keys are lemmas and values are sets
@@ -32,7 +32,7 @@ def get_words_from_dictionary(lemmas):
     :type lemmas: dict(str): list(str)
     :return: Set of words that exist as values in the dictionary
     :rtype: set(str)
-    """
+    '''
     words = set()
     for lemma in lemmas:
         words.update(set(lemmas[lemma]))
@@ -40,7 +40,7 @@ def get_words_from_dictionary(lemmas):
 
 
 def _truncate(words, cutlength):
-    """Group words by stems defined by truncating them at given length.
+    '''Group words by stems defined by truncating them at given length.
 
     :param words: Set of words used for analysis
     :param cutlength: Words are stemmed by cutting at this length.
@@ -49,7 +49,7 @@ def _truncate(words, cutlength):
     :return: Dictionary where keys are stems and values are sets of words
     corresponding to that stem.
     :rtype: dict(str): set(str)
-    """
+    '''
     stems = {}
     for word in words:
         stem = word[:cutlength]
@@ -62,7 +62,7 @@ def _truncate(words, cutlength):
 
 # Reference: http://en.wikipedia.org/wiki/Line-line_intersection
 def _count_intersection(l1, l2):
-    """Count intersection between two line segments defined by coordinate pairs.
+    '''Count intersection between two line segments defined by coordinate pairs.
 
     :param l1: Tuple of two coordinate pairs defining the first line segment
     :param l2: Tuple of two coordinate pairs defining the second line segment
@@ -70,7 +70,7 @@ def _count_intersection(l1, l2):
     :type l2: tuple(float, float)
     :return: Coordinates of the intersection
     :rtype: tuple(float, float)
-    """
+    '''
     x1, y1 = l1[0]
     x2, y2 = l1[1]
     x3, y3 = l2[0]
@@ -97,21 +97,21 @@ def _count_intersection(l1, l2):
 
 
 def _get_derivative(coordinates):
-    """Get derivative of the line from (0,0) to given coordinates.
+    '''Get derivative of the line from (0,0) to given coordinates.
 
     :param coordinates: A coordinate pair
     :type coordinates: tuple(float, float)
     :return: Derivative; inf if x is zero
     :rtype: float
-    """
+    '''
     try:
         return coordinates[1] / coordinates[0]
     except ZeroDivisionError:
-        return float("inf")
+        return float('inf')
 
 
 def _calculate_cut(lemmawords, stems):
-    """Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.
+    '''Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.
 
     :param lemmawords: Set or list of words corresponding to certain lemma.
     :param stems: A dictionary where keys are stems and values are sets
@@ -121,7 +121,7 @@ def _calculate_cut(lemmawords, stems):
     :return: Amount of understemmed and overstemmed pairs contributed by words
     existing in both lemmawords and stems.
     :rtype: tuple(float, float)
-    """
+    '''
     umt, wmt = 0.0, 0.0
     for stem in stems:
         cut = set(lemmawords) & set(stems[stem])
@@ -136,7 +136,7 @@ def _calculate_cut(lemmawords, stems):
 
 
 def _calculate(lemmas, stems):
-    """Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.
+    '''Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.
 
     :param lemmas: A dictionary where keys are lemmas and values are sets
     or lists of words corresponding to that lemma.
@@ -149,7 +149,7 @@ def _calculate(lemmas, stems):
     global wrongly merged total (gwmt) and
     global desired non-merge total (gdnt).
     :rtype: tuple(float, float, float, float)
-    """
+    '''
 
     n = sum(len(lemmas[word]) for word in lemmas)
 
@@ -177,7 +177,7 @@ def _calculate(lemmas, stems):
 
 
 def _indexes(gumt, gdmt, gwmt, gdnt):
-    """Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).
+    '''Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).
 
     :param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt),
     global desired merge total (gdmt),
@@ -188,7 +188,7 @@ def _indexes(gumt, gdmt, gwmt, gdnt):
     Overstemming Index (OI) and
     Stemming Weight (SW).
     :rtype: tuple(float, float, float)
-    """
+    '''
     # Calculate Understemming Index (UI),
     # Overstemming Index (OI) and Stemming Weight (SW)
     try:
@@ -206,25 +206,25 @@ def _indexes(gumt, gdmt, gwmt, gdnt):
     except ZeroDivisionError:
         if oi == 0.0:
             # OI and UI are 0, define SW as 'not a number'
-            sw = float("nan")
+            sw = float('nan')
         else:
             # UI is 0, define SW as infinity
-            sw = float("inf")
+            sw = float('inf')
     return (ui, oi, sw)
 
 
 class Paice(object):
-    """Class for storing lemmas, stems and evaluation metrics."""
+    '''Class for storing lemmas, stems and evaluation metrics.'''
 
     def __init__(self, lemmas, stems):
-        """
+        '''
         :param lemmas: A dictionary where keys are lemmas and values are sets
         or lists of words corresponding to that lemma.
         :param stems: A dictionary where keys are stems and values are sets
         or lists of words corresponding to that stem.
         :type lemmas: dict(str): list(str)
         :type stems: dict(str): set(str)
-        """
+        '''
         self.lemmas = lemmas
         self.stems = stems
         self.coords = []
@@ -234,20 +234,20 @@ class Paice(object):
         self.update()
 
     def __str__(self):
-        text = ["Global Unachieved Merge Total (GUMT): %s\n" % self.gumt]
-        text.append("Global Desired Merge Total (GDMT): %s\n" % self.gdmt)
-        text.append("Global Wrongly-Merged Total (GWMT): %s\n" % self.gwmt)
-        text.append("Global Desired Non-merge Total (GDNT): %s\n" % self.gdnt)
-        text.append("Understemming Index (GUMT / GDMT): %s\n" % self.ui)
-        text.append("Overstemming Index (GWMT / GDNT): %s\n" % self.oi)
-        text.append("Stemming Weight (OI / UI): %s\n" % self.sw)
-        text.append("Error-Rate Relative to Truncation (ERRT): %s\r\n" % self.errt)
-        coordinates = " ".join(["(%s, %s)" % item for item in self.coords])
-        text.append("Truncation line: %s" % coordinates)
-        return "".join(text)
+        text = ['Global Unachieved Merge Total (GUMT): %s\n' % self.gumt]
+        text.append('Global Desired Merge Total (GDMT): %s\n' % self.gdmt)
+        text.append('Global Wrongly-Merged Total (GWMT): %s\n' % self.gwmt)
+        text.append('Global Desired Non-merge Total (GDNT): %s\n' % self.gdnt)
+        text.append('Understemming Index (GUMT / GDMT): %s\n' % self.ui)
+        text.append('Overstemming Index (GWMT / GDNT): %s\n' % self.oi)
+        text.append('Stemming Weight (OI / UI): %s\n' % self.sw)
+        text.append('Error-Rate Relative to Truncation (ERRT): %s\r\n' % self.errt)
+        coordinates = ' '.join(['(%s, %s)' % item for item in self.coords])
+        text.append('Truncation line: %s' % coordinates)
+        return ''.join(text)
 
     def _get_truncation_indexes(self, words, cutlength):
-        """Count (UI, OI) when stemming is done by truncating words at \'cutlength\'.
+        '''Count (UI, OI) when stemming is done by truncating words at \'cutlength\'.
 
         :param words: Words used for the analysis
         :param cutlength: Words are stemmed by cutting them at this length
@@ -255,7 +255,7 @@ class Paice(object):
         :type cutlength: int
         :return: Understemming and overstemming indexes
         :rtype: tuple(int, int)
-        """
+        '''
 
         truncated = _truncate(words, cutlength)
         gumt, gdmt, gwmt, gdnt = _calculate(self.lemmas, truncated)
@@ -263,7 +263,7 @@ class Paice(object):
         return (ui, oi)
 
     def _get_truncation_coordinates(self, cutlength=0):
-        """Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.
+        '''Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.
 
         :param cutlength: Optional parameter to start counting from (ui, oi)
         coordinates gotten by stemming at this length. Useful for speeding up
@@ -272,7 +272,7 @@ class Paice(object):
         :type cutlength: int
         :return: List of coordinate pairs that define the truncation line
         :rtype: list(tuple(float, float))
-        """
+        '''
         words = get_words_from_dictionary(self.lemmas)
         maxlength = max(len(word) for word in words)
 
@@ -302,21 +302,21 @@ class Paice(object):
         return coords
 
     def _errt(self):
-        """Count Error-Rate Relative to Truncation (ERRT).
+        '''Count Error-Rate Relative to Truncation (ERRT).
 
         :return: ERRT, length of the line from origo to (UI, OI) divided by
         the length of the line from origo to the point defined by the same
         line when extended until the truncation line.
         :rtype: float
-        """
+        '''
         # Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line
         self.coords = self._get_truncation_coordinates()
         if (0.0, 0.0) in self.coords:
             # Truncation line goes through origo, so ERRT cannot be counted
             if (self.ui, self.oi) != (0.0, 0.0):
-                return float("inf")
+                return float('inf')
             else:
-                return float("nan")
+                return float('nan')
         if (self.ui, self.oi) == (0.0, 0.0):
             # (ui, oi) is origo; define errt as 0.0
             return 0.0
@@ -334,56 +334,56 @@ class Paice(object):
         return op / ot
 
     def update(self):
-        """Update statistics after lemmas and stems have been set."""
+        '''Update statistics after lemmas and stems have been set.'''
         self.gumt, self.gdmt, self.gwmt, self.gdnt = _calculate(self.lemmas, self.stems)
         self.ui, self.oi, self.sw = _indexes(self.gumt, self.gdmt, self.gwmt, self.gdnt)
         self.errt = self._errt()
 
 
 def demo():
-    """Demonstration of the module."""
+    '''Demonstration of the module.'''
     # Some words with their real lemmas
     lemmas = {
-        "kneel": ["kneel", "knelt"],
-        "range": ["range", "ranged"],
-        "ring": ["ring", "rang", "rung"],
+        'kneel': ['kneel', 'knelt'],
+        'range': ['range', 'ranged'],
+        'ring': ['ring', 'rang', 'rung'],
     }
     # Same words with stems from a stemming algorithm
     stems = {
-        "kneel": ["kneel"],
-        "knelt": ["knelt"],
-        "rang": ["rang", "range", "ranged"],
-        "ring": ["ring"],
-        "rung": ["rung"],
+        'kneel': ['kneel'],
+        'knelt': ['knelt'],
+        'rang': ['rang', 'range', 'ranged'],
+        'ring': ['ring'],
+        'rung': ['rung'],
     }
-    print("Words grouped by their lemmas:")
+    print('Words grouped by their lemmas:')
     for lemma in sorted(lemmas):
-        print("%s => %s" % (lemma, " ".join(lemmas[lemma])))
+        print('%s => %s' % (lemma, ' '.join(lemmas[lemma])))
     print()
-    print("Same words grouped by a stemming algorithm:")
+    print('Same words grouped by a stemming algorithm:')
     for stem in sorted(stems):
-        print("%s => %s" % (stem, " ".join(stems[stem])))
+        print('%s => %s' % (stem, ' '.join(stems[stem])))
     print()
     p = Paice(lemmas, stems)
     print(p)
     print()
     # Let's "change" results from a stemming algorithm
     stems = {
-        "kneel": ["kneel"],
-        "knelt": ["knelt"],
-        "rang": ["rang"],
-        "range": ["range", "ranged"],
-        "ring": ["ring"],
-        "rung": ["rung"],
+        'kneel': ['kneel'],
+        'knelt': ['knelt'],
+        'rang': ['rang'],
+        'range': ['range', 'ranged'],
+        'ring': ['ring'],
+        'rung': ['rung'],
     }
-    print("Counting stats after changing stemming results:")
+    print('Counting stats after changing stemming results:')
     for stem in sorted(stems):
-        print("%s => %s" % (stem, " ".join(stems[stem])))
+        print('%s => %s' % (stem, ' '.join(stems[stem])))
     print()
     p.stems = stems
     p.update()
     print(p)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
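
Distilling the demo above down to the attributes that update() sets, a hedged sketch (module path assumed to be nltk.metrics.paice):

    from nltk.metrics.paice import Paice

    lemmas = {'kneel': ['kneel', 'knelt'],
              'range': ['range', 'ranged'],
              'ring': ['ring', 'rang', 'rung']}
    stems = {'kneel': ['kneel'], 'knelt': ['knelt'],
             'rang': ['rang', 'range', 'ranged'],
             'ring': ['ring'], 'rung': ['rung']}
    p = Paice(lemmas, stems)
    print(p.ui, p.oi, p.sw)   # understemming index, overstemming index, stemming weight
    print(p.errt)             # error rate relative to truncation
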
index b5156ed..9e6a516 100644 (file)
@@ -1,16 +1,19 @@
 # Natural Language Toolkit: Evaluation
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, division
 
 from math import fabs
 import operator
 from random import shuffle
 from functools import reduce
 
+from six.moves import range, zip
+
 try:
     from scipy.stats.stats import betai
 except ImportError:
@@ -52,8 +55,8 @@ def precision(reference, test):
     :param test: A set of values to compare against the reference set.
     :rtype: float or None
     """
-    if not hasattr(reference, "intersection") or not hasattr(test, "intersection"):
-        raise TypeError("reference and test should be sets")
+    if not hasattr(reference, 'intersection') or not hasattr(test, 'intersection'):
+        raise TypeError('reference and test should be sets')
 
     if len(test) == 0:
         return None
@@ -74,8 +77,8 @@ def recall(reference, test):
     :param test: A set of values to compare against the reference set.
     :rtype: float or None
     """
-    if not hasattr(reference, "intersection") or not hasattr(test, "intersection"):
-        raise TypeError("reference and test should be sets")
+    if not hasattr(reference, 'intersection') or not hasattr(test, 'intersection'):
+        raise TypeError('reference and test should be sets')
 
     if len(reference) == 0:
         return None
@@ -157,20 +160,20 @@ def approxrand(a, b, **kwargs):
     :param b: another list of independently generated test values
     :type b: list
     """
-    shuffles = kwargs.get("shuffles", 999)
+    shuffles = kwargs.get('shuffles', 999)
     # there's no point in trying to shuffle beyond all possible permutations
     shuffles = min(shuffles, reduce(operator.mul, range(1, len(a) + len(b) + 1)))
-    stat = kwargs.get("statistic", lambda lst: sum(lst) / len(lst))
-    verbose = kwargs.get("verbose", False)
+    stat = kwargs.get('statistic', lambda lst: sum(lst) / len(lst))
+    verbose = kwargs.get('verbose', False)
 
     if verbose:
-        print("shuffles: %d" % shuffles)
+        print('shuffles: %d' % shuffles)
 
     actual_stat = fabs(stat(a) - stat(b))
 
     if verbose:
-        print("actual statistic: %f" % actual_stat)
-        print("-" * 60)
+        print('actual statistic: %f' % actual_stat)
+        print('-' * 60)
 
     c = 1e-100
     lst = LazyConcatenation([a, b])
@@ -178,7 +181,7 @@ def approxrand(a, b, **kwargs):
 
     for i in range(shuffles):
         if verbose and i % 10 == 0:
-            print("shuffle: %d" % i)
+            print('shuffle: %d' % i)
 
         shuffle(indices)
 
@@ -190,14 +193,14 @@ def approxrand(a, b, **kwargs):
             c += 1
 
         if verbose and i % 10 == 0:
-            print("pseudo-statistic: %f" % pseudo_stat)
-            print("significance: %f" % ((c + 1) / (i + 1)))
-            print("-" * 60)
+            print('pseudo-statistic: %f' % pseudo_stat)
+            print('significance: %f' % ((c + 1) / (i + 1)))
+            print('-' * 60)
 
     significance = (c + 1) / (shuffles + 1)
 
     if verbose:
-        print("significance: %f" % significance)
+        print('significance: %f' % significance)
         if betai:
             for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
                 print("prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi)))
@@ -206,23 +209,23 @@ def approxrand(a, b, **kwargs):
 
 
 def demo():
-    print("-" * 75)
-    reference = "DET NN VB DET JJ NN NN IN DET NN".split()
-    test = "DET VB VB DET NN NN NN IN DET NN".split()
-    print("Reference =", reference)
-    print("Test    =", test)
-    print("Accuracy:", accuracy(reference, test))
-
-    print("-" * 75)
+    print('-' * 75)
+    reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
+    test = 'DET VB VB DET NN NN NN IN DET NN'.split()
+    print('Reference =', reference)
+    print('Test    =', test)
+    print('Accuracy:', accuracy(reference, test))
+
+    print('-' * 75)
     reference_set = set(reference)
     test_set = set(test)
-    print("Reference =", reference_set)
-    print("Test =   ", test_set)
-    print("Precision:", precision(reference_set, test_set))
-    print("   Recall:", recall(reference_set, test_set))
-    print("F-Measure:", f_measure(reference_set, test_set))
-    print("-" * 75)
+    print('Reference =', reference_set)
+    print('Test =   ', test_set)
+    print('Precision:', precision(reference_set, test_set))
+    print('   Recall:', recall(reference_set, test_set))
+    print('F-Measure:', f_measure(reference_set, test_set))
+    print('-' * 75)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
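
For context, a minimal sketch of the scores touched above (this file is assumed to be nltk/metrics/scores.py); precision and recall require set arguments, as the TypeError hunks show:

    from nltk.metrics.scores import accuracy, precision, recall, f_measure

    ref = 'DET NN VB DET JJ NN NN IN DET NN'.split()
    test = 'DET VB VB DET NN NN NN IN DET NN'.split()
    print(accuracy(ref, test))                 # fraction of positions on which the lists agree
    print(precision(set(ref), set(test)))      # |ref & test| / |test|
    print(recall(set(ref), set(test)))         # |ref & test| / |ref|
    print(f_measure(set(ref), set(test)))      # harmonic mean of the two at the default alpha=0.5
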
index 412e00d..9a96c15 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Text Segmentation Metrics
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 #         David Doukhan <david.doukhan@gmail.com>
@@ -45,6 +45,8 @@ try:
 except ImportError:
     pass
 
+from six.moves import range
+
 
 def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
     """
@@ -118,7 +120,7 @@ def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff):
             mat[i + 1, j + 1] = min(tcost, shift_cost)
 
 
-def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"):
+def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary='1'):
     """
     Compute the Generalized Hamming Distance for a reference and a hypothetical
     segmentation, corresponding to the cost related to the transformation
@@ -185,7 +187,7 @@ def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"
 # Beeferman's Pk text segmentation evaluation metric
 
 
-def pk(ref, hyp, k=None, boundary="1"):
+def pk(ref, hyp, k=None, boundary='1'):
     """
     Compute the Pk metric for a pair of segmentations A segmentation
     is any sequence over a vocabulary of two items (e.g. "0", "1"),
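
A brief hedged sketch of the segmentation metrics whose signatures appear above (module path assumed to be nltk.metrics.segmentation); boundaries are marked with '1', matching the default boundary argument:

    from nltk.metrics.segmentation import windowdiff, pk

    ref = '000100000100'   # reference segmentation with two boundaries
    hyp = '000010000100'   # hypothesis: the first boundary is off by one position
    print(windowdiff(ref, hyp, 3))   # penalises near-miss boundaries within a window of size 3
    print(pk(ref, hyp))              # Beeferman's Pk; k is derived from ref when omitted
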
index a6d17db..3736b8f 100644 (file)
@@ -1,9 +1,10 @@
 # Natural Language Toolkit: Spearman Rank Correlation
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Joel Nothman <jnothman@student.usyd.edu.au>
 # URL: <http://nltk.org>
 # For license information, see LICENSE.TXT
+from __future__ import division
 
 """
 Tools for comparing ranked lists.
index ab1f761..63c1da9 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Miscellaneous modules
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
index 8b0d95b..c5604f0 100644 (file)
Binary files a/nlp_resource_data/nltk/misc/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/misc/__pycache__/__init__.cpython-37.pyc differ
index 8623bb4..93b594b 100644 (file)
Binary files a/nlp_resource_data/nltk/misc/__pycache__/babelfish.cpython-37.pyc and b/nlp_resource_data/nltk/misc/__pycache__/babelfish.cpython-37.pyc differ
index 9de1f5d..3dc810d 100644 (file)
Binary files a/nlp_resource_data/nltk/misc/__pycache__/chomsky.cpython-37.pyc and b/nlp_resource_data/nltk/misc/__pycache__/chomsky.cpython-37.pyc differ
index 919b3eb..cd748bc 100644 (file)
Binary files a/nlp_resource_data/nltk/misc/__pycache__/minimalset.cpython-37.pyc and b/nlp_resource_data/nltk/misc/__pycache__/minimalset.cpython-37.pyc differ
index bb9d353..a68ae6d 100644 (file)
Binary files a/nlp_resource_data/nltk/misc/__pycache__/sort.cpython-37.pyc and b/nlp_resource_data/nltk/misc/__pycache__/sort.cpython-37.pyc differ
index 364f733..17e6e44 100644 (file)
Binary files a/nlp_resource_data/nltk/misc/__pycache__/wordfinder.cpython-37.pyc and b/nlp_resource_data/nltk/misc/__pycache__/wordfinder.cpython-37.pyc differ
index a43fd4d..fb00bf5 100644 (file)
@@ -4,6 +4,7 @@ translation service; this service is no longer available; this
 module is kept in NLTK source code in order to provide better error
 messages for people following the NLTK Book 2.0.
 """
+from __future__ import print_function
 
 
 def babelize_shell():
index 0d4b065..d910024 100644 (file)
@@ -12,6 +12,7 @@ To generate n sentences of linguistic wisdom, type
     (CHOMSKY n)  -- for example
     (CHOMSKY 5) generates half a screen of linguistic truth.
 """
+from __future__ import print_function
 
 leadins = """To characterize a linguistic level L,
     On the other hand,
@@ -118,6 +119,8 @@ scope of a complex symbol.
 import textwrap, random
 from itertools import chain, islice
 
+from six.moves import zip
+
 
 def generate_chomsky(times=5, line_length=72):
     parts = []
@@ -129,5 +132,5 @@ def generate_chomsky(times=5, line_length=72):
     print(textwrap.fill(" ".join(output), line_length))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     generate_chomsky()
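
For completeness, a minimal way to exercise the generator above (module path assumed to be nltk.misc.chomsky, per the pycache listing earlier in this commit):

    from nltk.misc.chomsky import generate_chomsky

    generate_chomsky(times=3, line_length=72)   # prints three generated sentences, wrapped to 72 columns
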
index ea49d08..ca298e8 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Minimal Sets
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org>
 # For license information, see LICENSE.TXT
index 0dbaf99..cef988e 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: List Sorting
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -10,6 +10,8 @@ This module provides a variety of list sorting algorithms, to
 illustrate the many different algorithms (recipes) for solving a
 problem, and how to analyze algorithms experimentally.
 """
+from __future__ import print_function, division
+
 # These algorithms are taken from:
 # Levitin (2004) The Design and Analysis of Algorithms
 
@@ -174,5 +176,5 @@ def demo():
         )
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
index 4514f62..a0b8ae7 100644 (file)
@@ -1,12 +1,13 @@
 # Natural Language Toolkit: Word Finder
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 # Simplified from PHP version by Robert Klein <brathna@gmail.com>
 # http://fswordfinder.sourceforge.net/
+from __future__ import print_function
 
 import random
 
@@ -48,7 +49,7 @@ def check(word, dir, x, y, grid, rows, cols):
         return step(word, x, lambda i: x, y, lambda i: y - i, grid)
 
 
-def wordfinder(words, rows=20, cols=20, attempts=50, alph="ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
+def wordfinder(words, rows=20, cols=20, attempts=50, alph='ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
     """
     Attempt to arrange words into a letter-grid with the specified
     number of rows and columns.  Try each word in several positions
@@ -109,7 +110,7 @@ def wordfinder(words, rows=20, cols=20, attempts=50, alph="ABCDEFGHIJKLMNOPQRSTU
     # Fill up the remaining spaces
     for i in range(rows):
         for j in range(cols):
-            if grid[i][j] == "":
+            if grid[i][j] == '':
                 grid[i][j] = random.choice(alph)
 
     return grid, used
@@ -127,7 +128,7 @@ def word_finder():
     print("Word Finder\n")
     for i in range(len(grid)):
         for j in range(len(grid[i])):
-            print(grid[i][j], end=" ")
+            print(grid[i][j], end=' ')
         print()
     print()
 
@@ -135,5 +136,5 @@ def word_finder():
         print("%d:" % (i + 1), used[i])
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     word_finder()
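
A hedged sketch of calling wordfinder() directly rather than through the word_finder() demo (module path assumed to be nltk.misc.wordfinder); uppercase words are used to match the default filler alphabet:

    from nltk.misc.wordfinder import wordfinder

    grid, used = wordfinder(['PYTHON', 'GRAMMAR', 'PARSER', 'TOKEN'], rows=10, cols=10)
    for row in grid:        # the filled letter grid
        print(' '.join(row))
    print(used)             # the words that were placed in the grid
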
index e4f0f15..52cd4f1 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Parsers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
index f0b4503..868c424 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/__init__.cpython-37.pyc differ
index 2c6c601..7bee1ea 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/api.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/api.cpython-37.pyc differ
index 6edee04..da0115e 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/bllip.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/bllip.cpython-37.pyc differ
index 2aab3d0..8277e7f 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/chart.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/chart.cpython-37.pyc differ
index f706372..994838d 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/corenlp.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/corenlp.cpython-37.pyc differ
index a8d4eaa..788e48f 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/dependencygraph.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/dependencygraph.cpython-37.pyc differ
index 19306a9..b385ab6 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/earleychart.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/earleychart.cpython-37.pyc differ
index 58e00bf..baeb98a 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/evaluate.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/evaluate.cpython-37.pyc differ
index 8cdc116..3f3c6a6 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/featurechart.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/featurechart.cpython-37.pyc differ
index e619604..a55f72c 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/generate.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/generate.cpython-37.pyc differ
index 6dcd31b..605b5c1 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/malt.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/malt.cpython-37.pyc differ
index 6d3072d..7f70faa 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/nonprojectivedependencyparser.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/nonprojectivedependencyparser.cpython-37.pyc differ
index e9a5817..ceff037 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/pchart.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/pchart.cpython-37.pyc differ
index 08e4a80..9cc6797 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/projectivedependencyparser.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/projectivedependencyparser.cpython-37.pyc differ
index b64a4db..3c1a499 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/recursivedescent.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/recursivedescent.cpython-37.pyc differ
index 93f7265..d241cf5 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/shiftreduce.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/shiftreduce.cpython-37.pyc differ
index 255f7bd..6aa9c92 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/stanford.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/stanford.cpython-37.pyc differ
index 7b8d53b..f2a5b35 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/transitionparser.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/transitionparser.cpython-37.pyc differ
index a5cff93..5e0696c 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/util.cpython-37.pyc differ
index 8246ca7..3ff28cf 100644 (file)
Binary files a/nlp_resource_data/nltk/parse/__pycache__/viterbi.cpython-37.pyc and b/nlp_resource_data/nltk/parse/__pycache__/viterbi.cpython-37.pyc differ
index 2a12adc..5372b10 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Parser API
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
index 144f040..01934b8 100644 (file)
@@ -2,10 +2,12 @@
 #
 # Author: David McClosky <dmcc@bigasterisk.com>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import print_function
+
 from nltk.parse.api import ParserI
 from nltk.tree import Tree
 
@@ -79,7 +81,7 @@ See http://pypi.python.org/pypi/bllipparser/ for more information
 on BLLIP Parser's Python interface.
 """
 
-__all__ = ["BllipParser"]
+__all__ = ['BllipParser']
 
 # this block allows this module to be imported even if bllipparser isn't
 # available
@@ -100,7 +102,7 @@ except ImportError as ie:
 def _ensure_ascii(words):
     try:
         for i, word in enumerate(words):
-            word.decode("ascii")
+            word.decode('ascii')
     except UnicodeDecodeError:
         raise ValueError(
             "Token %d (%r) is non-ASCII. BLLIP Parser "
@@ -260,19 +262,19 @@ def demo():
 
     from nltk.data import find
 
-    model_dir = find("models/bllip_wsj_no_aux").path
+    model_dir = find('models/bllip_wsj_no_aux').path
 
-    print("Loading BLLIP Parsing models...")
+    print('Loading BLLIP Parsing models...')
     # the easiest way to get started is to use a unified model
     bllip = BllipParser.from_unified_model_dir(model_dir)
-    print("Done.")
+    print('Done.')
 
-    sentence1 = "British left waffles on Falklands .".split()
-    sentence2 = "I saw the man with the telescope .".split()
+    sentence1 = 'British left waffles on Falklands .'.split()
+    sentence2 = 'I saw the man with the telescope .'.split()
     # this sentence is known to fail under the WSJ parsing model
-    fail1 = "# ! ? : -".split()
+    fail1 = '# ! ? : -'.split()
     for sentence in (sentence1, sentence2, fail1):
-        print("Sentence: %r" % " ".join(sentence))
+        print('Sentence: %r' % ' '.join(sentence))
         try:
             tree = next(bllip.parse(sentence))
             print(tree)
@@ -281,22 +283,22 @@ def demo():
 
     # n-best parsing demo
     for i, parse in enumerate(bllip.parse(sentence1)):
-        print("parse %d:\n%s" % (i, parse))
+        print('parse %d:\n%s' % (i, parse))
 
     # using external POS tag constraints
     print(
         "forcing 'tree' to be 'NN':",
-        next(bllip.tagged_parse([("A", None), ("tree", "NN")])),
+        next(bllip.tagged_parse([('A', None), ('tree', 'NN')])),
     )
     print(
         "forcing 'A' to be 'DT' and 'tree' to be 'NNP':",
-        next(bllip.tagged_parse([("A", "DT"), ("tree", "NNP")])),
+        next(bllip.tagged_parse([('A', 'DT'), ('tree', 'NNP')])),
     )
     # constraints don't have to make sense... (though on more complicated
     # sentences, they may cause the parse to fail)
     print(
         "forcing 'A' to be 'NNP':",
-        next(bllip.tagged_parse([("A", "NNP"), ("tree", None)])),
+        next(bllip.tagged_parse([('A', 'NNP'), ('tree', None)])),
     )
 
 
@@ -307,6 +309,6 @@ def setup_module(module):
         _ensure_bllip_import_or_error()
     except ImportError:
         raise SkipTest(
-            "doctests from nltk.parse.bllip are skipped because "
-            "the bllipparser module is not installed"
+            'doctests from nltk.parse.bllip are skipped because '
+            'the bllipparser module is not installed'
         )
index dffd644..f1f68b4 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: A Chart Parser
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 #         Jean Mark Gawron <gawron@mail.sdsu.edu>
@@ -35,16 +35,20 @@ defines three chart parsers:
   - ``SteppingChartParser`` is a subclass of ``ChartParser`` that can
     be used to step through the parsing process.
 """
+from __future__ import print_function, division, unicode_literals
 
 import itertools
 import re
 import warnings
 from functools import total_ordering
 
+from six.moves import range
+
 from nltk.tree import Tree
 from nltk.grammar import PCFG, is_nonterminal, is_terminal
 from nltk.util import OrderedDict
 from nltk.internals import raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible, unicode_repr
 
 from nltk.parse.api import ParserI
 
@@ -90,7 +94,7 @@ class EdgeI(object):
 
     def __init__(self):
         if self.__class__ == EdgeI:
-            raise TypeError("Edge is an abstract interface")
+            raise TypeError('Edge is an abstract interface')
 
     # ////////////////////////////////////////////////////////////
     # Span
@@ -225,6 +229,7 @@ class EdgeI(object):
             return self._hash
 
 
+@python_2_unicode_compatible
 class TreeEdge(EdgeI):
     """
     An edge that records the fact that a tree is (partially)
@@ -345,21 +350,22 @@ class TreeEdge(EdgeI):
 
     # String representation
     def __str__(self):
-        str = "[%s:%s] " % (self._span[0], self._span[1])
-        str += "%-2r ->" % (self._lhs,)
+        str = '[%s:%s] ' % (self._span[0], self._span[1])
+        str += '%-2r ->' % (self._lhs,)
 
         for i in range(len(self._rhs)):
             if i == self._dot:
-                str += " *"
-            str += " %s" % repr(self._rhs[i])
+                str += ' *'
+            str += ' %s' % unicode_repr(self._rhs[i])
         if len(self._rhs) == self._dot:
-            str += " *"
+            str += ' *'
         return str
 
     def __repr__(self):
-        return "[Edge: %s]" % self
+        return '[Edge: %s]' % self
 
 
+@python_2_unicode_compatible
 class LeafEdge(EdgeI):
     """
     An edge that records the fact that a leaf value is consistent with
@@ -419,10 +425,10 @@ class LeafEdge(EdgeI):
 
     # String representations
     def __str__(self):
-        return "[%s:%s] %s" % (self._index, self._index + 1, repr(self._leaf))
+        return '[%s:%s] %s' % (self._index, self._index + 1, unicode_repr(self._leaf))
 
     def __repr__(self):
-        return "[Edge: %s]" % (self)
+        return '[Edge: %s]' % (self)
 
 
 ########################################################################
@@ -600,7 +606,7 @@ class Chart(object):
         # Make sure it's a valid index.
         for key in restr_keys:
             if not hasattr(EdgeI, key):
-                raise ValueError("Bad restriction: %s" % key)
+                raise ValueError('Bad restriction: %s' % key)
 
         # Create the index.
         index = self._indexes[restr_keys] = {}
@@ -778,26 +784,26 @@ class Chart(object):
             width = 50 // (self.num_leaves() + 1)
         (start, end) = (edge.start(), edge.end())
 
-        str = "|" + ("." + " " * (width - 1)) * start
+        str = '|' + ('.' + ' ' * (width - 1)) * start
 
         # Zero-width edges are "#" if complete, ">" if incomplete
         if start == end:
             if edge.is_complete():
-                str += "#"
+                str += '#'
             else:
-                str += ">"
+                str += '>'
 
         # Spanning complete edges are "[===]"; Other edges are
         # "[---]" if complete, "[--->" if incomplete
         elif edge.is_complete() and edge.span() == (0, self._num_leaves):
-            str += "[" + ("=" * width) * (end - start - 1) + "=" * (width - 1) + "]"
+            str += '[' + ('=' * width) * (end - start - 1) + '=' * (width - 1) + ']'
         elif edge.is_complete():
-            str += "[" + ("-" * width) * (end - start - 1) + "-" * (width - 1) + "]"
+            str += '[' + ('-' * width) * (end - start - 1) + '-' * (width - 1) + ']'
         else:
-            str += "[" + ("-" * width) * (end - start - 1) + "-" * (width - 1) + ">"
+            str += '[' + ('-' * width) * (end - start - 1) + '-' * (width - 1) + '>'
 
-        str += (" " * (width - 1) + ".") * (self._num_leaves - end)
-        return str + "| %s" % edge
+        str += (' ' * (width - 1) + '.') * (self._num_leaves - end)
+        return str + '| %s' % edge
 
     def pretty_format_leaves(self, width=None):
         """
@@ -809,12 +815,12 @@ class Chart(object):
             width = 50 // (self.num_leaves() + 1)
 
         if self._tokens is not None and width > 1:
-            header = "|."
+            header = '|.'
             for tok in self._tokens:
-                header += tok[: width - 1].center(width - 1) + "."
-            header += "|"
+                header += tok[: width - 1].center(width - 1) + '.'
+            header += '|'
         else:
-            header = ""
+            header = ''
 
         return header
 
@@ -835,8 +841,8 @@ class Chart(object):
 
         return (
             self.pretty_format_leaves(width)
-            + "\n"
-            + "\n".join(self.pretty_format_edge(edge, width) for edge in edges)
+            + '\n'
+            + '\n'.join(self.pretty_format_edge(edge, width) for edge in edges)
         )
 
     # ////////////////////////////////////////////////////////////
@@ -845,10 +851,10 @@ class Chart(object):
 
     def dot_digraph(self):
         # Header
-        s = "digraph nltk_chart {\n"
+        s = 'digraph nltk_chart {\n'
         # s += '  size="5,5";\n'
-        s += "  rankdir=LR;\n"
-        s += "  node [height=0.1,width=0.1];\n"
+        s += '  rankdir=LR;\n'
+        s += '  node [height=0.1,width=0.1];\n'
         s += '  node [style=filled, color="lightgray"];\n'
 
         # Set up the nodes
@@ -862,28 +868,28 @@ class Chart(object):
                     s += '  %04d.%04d [label=""];\n' % (x, y)
 
         # Add a spacer
-        s += "  x [style=invis]; x->0000.0000 [style=invis];\n"
+        s += '  x [style=invis]; x->0000.0000 [style=invis];\n'
 
         # Declare ranks.
         for x in range(self.num_leaves() + 1):
-            s += "  {rank=same;"
+            s += '  {rank=same;'
             for y in range(self.num_edges() + 1):
                 if y == 0 or (
                     x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end()
                 ):
-                    s += " %04d.%04d" % (x, y)
-            s += "}\n"
+                    s += ' %04d.%04d' % (x, y)
+            s += '}\n'
 
         # Add the leaves
-        s += "  edge [style=invis, weight=100];\n"
-        s += "  node [shape=plaintext]\n"
-        s += "  0000.0000"
+        s += '  edge [style=invis, weight=100];\n'
+        s += '  node [shape=plaintext]\n'
+        s += '  0000.0000'
         for x in range(self.num_leaves()):
-            s += "->%s->%04d.0000" % (self.leaf(x), x + 1)
-        s += ";\n\n"
+            s += '->%s->%04d.0000' % (self.leaf(x), x + 1)
+        s += ';\n\n'
 
         # Add the edges
-        s += "  edge [style=solid, weight=1];\n"
+        s += '  edge [style=solid, weight=1];\n'
         for y, edge in enumerate(self):
             for x in range(edge.start()):
                 s += '  %04d.%04d -> %04d.%04d [style="invis"];\n' % (
@@ -906,7 +912,7 @@ class Chart(object):
                     x + 1,
                     y + 1,
                 )
-        s += "}\n"
+        s += '}\n'
         return s
 
 
@@ -962,6 +968,7 @@ class ChartRuleI(object):
         raise NotImplementedError()
 
 
+@python_2_unicode_compatible
 class AbstractChartRule(ChartRuleI):
     """
     An abstract base class for chart rules.  ``AbstractChartRule``
@@ -1004,12 +1011,12 @@ class AbstractChartRule(ChartRuleI):
                             yield new_edge
 
         else:
-            raise AssertionError("NUM_EDGES>3 is not currently supported")
+            raise AssertionError('NUM_EDGES>3 is not currently supported')
 
     # Default: return a name based on the class name.
     def __str__(self):
         # Add spaces between InitialCapsWords.
-        return re.sub("([a-z])([A-Z])", r"\1 \2", self.__class__.__name__)
+        return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
 
 
 # ////////////////////////////////////////////////////////////
@@ -1421,7 +1428,7 @@ class ChartParser(ParserI):
         print_rule_header = trace > 1
         for edge in new_edges:
             if print_rule_header:
-                print("%s:" % rule)
+                print('%s:' % rule)
                 print_rule_header = False
             print(chart.pretty_format_edge(edge, edge_width))
 
@@ -1589,7 +1596,7 @@ class SteppingChartParser(ChartParser):
         added with the current strategy and grammar.
         """
         if self._chart is None:
-            raise ValueError("Parser must be initialized first")
+            raise ValueError('Parser must be initialized first')
         while True:
             self._restart = False
             w = 50 // (self._chart.num_leaves() + 1)
@@ -1736,7 +1743,7 @@ def demo(
     print_grammar=False,
     print_trees=True,
     trace=2,
-    sent="I saw John with a dog with my cookie",
+    sent='I saw John with a dog with my cookie',
     numparses=5,
 ):
     """
@@ -1761,34 +1768,34 @@ def demo(
     # Ask the user which parser to test,
     # if the parser wasn't provided as an argument
     if choice is None:
-        print("  1: Top-down chart parser")
-        print("  2: Bottom-up chart parser")
-        print("  3: Bottom-up left-corner chart parser")
-        print("  4: Left-corner chart parser with bottom-up filter")
-        print("  5: Stepping chart parser (alternating top-down & bottom-up)")
-        print("  6: All parsers")
-        print("\nWhich parser (1-6)? ", end=" ")
+        print('  1: Top-down chart parser')
+        print('  2: Bottom-up chart parser')
+        print('  3: Bottom-up left-corner chart parser')
+        print('  4: Left-corner chart parser with bottom-up filter')
+        print('  5: Stepping chart parser (alternating top-down & bottom-up)')
+        print('  6: All parsers')
+        print('\nWhich parser (1-6)? ', end=' ')
         choice = sys.stdin.readline().strip()
         print()
 
     choice = str(choice)
     if choice not in "123456":
-        print("Bad parser number")
+        print('Bad parser number')
         return
 
     # Keep track of how long each parser takes.
     times = {}
 
     strategies = {
-        "1": ("Top-down", TD_STRATEGY),
-        "2": ("Bottom-up", BU_STRATEGY),
-        "3": ("Bottom-up left-corner", BU_LC_STRATEGY),
-        "4": ("Filtered left-corner", LC_STRATEGY),
+        '1': ('Top-down', TD_STRATEGY),
+        '2': ('Bottom-up', BU_STRATEGY),
+        '3': ('Bottom-up left-corner', BU_LC_STRATEGY),
+        '4': ('Filtered left-corner', LC_STRATEGY),
     }
     choices = []
     if choice in strategies:
         choices = [choice]
-    if choice == "6":
+    if choice == '6':
         choices = "1234"
 
     # Run the requested chart parser(s), except the stepping parser.
@@ -1803,7 +1810,7 @@ def demo(
         times[strategies[strategy][0]] = time.time() - t
         print("Nr edges in chart:", len(chart.edges()))
         if numparses:
-            assert len(parses) == numparses, "Not all parses found"
+            assert len(parses) == numparses, 'Not all parses found'
         if print_trees:
             for tree in parses:
                 print(tree)
@@ -1819,20 +1826,20 @@ def demo(
         cp = SteppingChartParser(grammar, trace=trace)
         cp.initialize(tokens)
         for i in range(5):
-            print("*** SWITCH TO TOP DOWN")
+            print('*** SWITCH TO TOP DOWN')
             cp.set_strategy(TD_STRATEGY)
             for j, e in enumerate(cp.step()):
                 if j > 20 or e is None:
                     break
-            print("*** SWITCH TO BOTTOM UP")
+            print('*** SWITCH TO BOTTOM UP')
             cp.set_strategy(BU_STRATEGY)
             for j, e in enumerate(cp.step()):
                 if j > 20 or e is None:
                     break
-        times["Stepping"] = time.time() - t
+        times['Stepping'] = time.time() - t
         print("Nr edges in chart:", len(cp.chart().edges()))
         if numparses:
-            assert len(list(cp.parses())) == numparses, "Not all parses found"
+            assert len(list(cp.parses())) == numparses, 'Not all parses found'
         if print_trees:
             for tree in cp.parses():
                 print(tree)
@@ -1846,11 +1853,11 @@ def demo(
     print("* Parsing times")
     print()
     maxlen = max(len(key) for key in times)
-    format = "%" + repr(maxlen) + "s parser: %6.3fsec"
+    format = '%' + repr(maxlen) + 's parser: %6.3fsec'
     times_items = times.items()
     for (parser, t) in sorted(times_items, key=lambda a: a[1]):
         print(format % (parser, t))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
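
The hunks above re-attach the @python_2_unicode_compatible decorator from nltk.compat and switch plain repr back to the unicode_repr helper, both of which only matter when the code must also run under Python 2. As a rough, six-style sketch of what such a decorator does (NLTK's own helper in nltk.compat is somewhat more elaborate):

import sys

def python_2_unicode_compatible(klass):
    # On Python 2, the class is expected to define __str__ returning text;
    # alias it to __unicode__ and make __str__ return UTF-8 bytes instead.
    # On Python 3 the class is returned unchanged.
    if sys.version_info[0] == 2:
        klass.__unicode__ = klass.__str__
        klass.__str__ = lambda self: self.__unicode__().encode('utf-8')
    return klass

Under Python 3 the decorator is effectively a no-op, which is why later NLTK releases dropped it together with the unicode_repr shim.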
index 1ba4801..1c4f785 100644
@@ -1,12 +1,14 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Interface to the CoreNLP REST API.
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Dmitrijs Milajevs <dimazest@gmail.com>
 #
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import unicode_literals
+
 import re
 import json
 import time
@@ -22,7 +24,7 @@ from nltk.tree import Tree
 
 from unittest import skip
 
-_stanford_url = "http://stanfordnlp.github.io/CoreNLP/"
+_stanford_url = 'http://stanfordnlp.github.io/CoreNLP/'
 
 
 class CoreNLPServerError(EnvironmentError):
@@ -31,7 +33,7 @@ class CoreNLPServerError(EnvironmentError):
 
 def try_port(port=0):
     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    sock.bind(("", port))
+    sock.bind(('', port))
 
     p = sock.getsockname()[1]
     sock.close()
@@ -41,8 +43,8 @@ def try_port(port=0):
 
 class CoreNLPServer(object):
 
-    _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar"
-    _JAR = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar"
+    _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar'
+    _JAR = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar'
 
     def __init__(
         self,
@@ -55,13 +57,13 @@ class CoreNLPServer(object):
     ):
 
         if corenlp_options is None:
-            corenlp_options = ["-preload", "tokenize,ssplit,pos,lemma,parse,depparse"]
+            corenlp_options = ['-preload', 'tokenize,ssplit,pos,lemma,parse,depparse']
 
         jars = list(
             find_jar_iter(
                 self._JAR,
                 path_to_jar,
-                env_vars=("CORENLP",),
+                env_vars=('CORENLP',),
                 searchpath=(),
                 url=_stanford_url,
                 verbose=verbose,
@@ -81,13 +83,13 @@ class CoreNLPServer(object):
         else:
             try_port(port)
 
-        self.url = "http://localhost:{}".format(port)
+        self.url = 'http://localhost:{}'.format(port)
 
         model_jar = max(
             find_jar_iter(
                 self._MODEL_JAR_PATTERN,
                 path_to_models_jar,
-                env_vars=("CORENLP_MODELS",),
+                env_vars=('CORENLP_MODELS',),
                 searchpath=(),
                 url=_stanford_url,
                 verbose=verbose,
@@ -101,31 +103,29 @@ class CoreNLPServer(object):
         self._classpath = stanford_jar, model_jar
 
         self.corenlp_options = corenlp_options
-        self.java_options = java_options or ["-mx2g"]
-
-    def start(self, stdout="devnull", stderr="devnull"):
-        """ Starts the CoreNLP server
+        self.java_options = java_options or ['-mx2g']
 
-        :param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe'
-        """
+    def start(self):
         import requests
 
-        cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"]
+        cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer']
 
         if self.corenlp_options:
             cmd.extend(self.corenlp_options)
 
         # Configure java.
-        default_options = " ".join(_java_options)
+        default_options = ' '.join(_java_options)
         config_java(options=self.java_options, verbose=self.verbose)
 
         try:
+            # TODO: it's probably a bad idea to pipe stdout, as it will
+            #       accumulate when lots of text is being parsed.
             self.popen = java(
                 cmd,
                 classpath=self._classpath,
                 blocking=False,
-                stdout=stdout,
-                stderr=stderr,
+                stdout='pipe',
+                stderr='pipe',
             )
         finally:
             # Return java configurations to their default values.
@@ -137,31 +137,31 @@ class CoreNLPServer(object):
             _, stderrdata = self.popen.communicate()
             raise CoreNLPServerError(
                 returncode,
-                "Could not start the server. "
-                "The error was: {}".format(stderrdata.decode("ascii")),
+                'Could not start the server. '
+                'The error was: {}'.format(stderrdata.decode('ascii')),
             )
 
         for i in range(30):
             try:
-                response = requests.get(requests.compat.urljoin(self.url, "live"))
+                response = requests.get(requests.compat.urljoin(self.url, 'live'))
             except requests.exceptions.ConnectionError:
                 time.sleep(1)
             else:
                 if response.ok:
                     break
         else:
-            raise CoreNLPServerError("Could not connect to the server.")
+            raise CoreNLPServerError('Could not connect to the server.')
 
         for i in range(60):
             try:
-                response = requests.get(requests.compat.urljoin(self.url, "ready"))
+                response = requests.get(requests.compat.urljoin(self.url, 'ready'))
             except requests.exceptions.ConnectionError:
                 time.sleep(1)
             else:
                 if response.ok:
                     break
         else:
-            raise CoreNLPServerError("The server is not ready.")
+            raise CoreNLPServerError('The server is not ready.')
 
     def stop(self):
         self.popen.terminate()
@@ -180,13 +180,13 @@ class CoreNLPServer(object):
 class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
     """Interface to the CoreNLP Parser."""
 
-    def __init__(self, url="http://localhost:9000", encoding="utf8", tagtype=None):
+    def __init__(self, url='http://localhost:9000', encoding='utf8', tagtype=None):
         import requests
 
         self.url = url
         self.encoding = encoding
 
-        if tagtype not in ["pos", "ner", None]:
+        if tagtype not in ['pos', 'ner', None]:
             raise ValueError("tagtype must be either 'pos', 'ner' or None")
 
         self.tagtype = tagtype
@@ -208,7 +208,7 @@ class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
         :rtype: iter(iter(Tree))
         """
         # Converting list(list(str)) -> list(str)
-        sentences = (" ".join(words) for words in sentences)
+        sentences = (' '.join(words) for words in sentences)
         return self.raw_parse_sents(sentences, *args, **kwargs)
 
     def raw_parse(self, sentence, properties=None, *args, **kwargs):
@@ -221,7 +221,7 @@ class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
         :type sentence: str
         :rtype: iter(Tree)
         """
-        default_properties = {"tokenize.whitespace": "false"}
+        default_properties = {'tokenize.whitespace': 'false'}
         default_properties.update(properties or {})
 
         return next(
@@ -230,10 +230,10 @@ class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
             )
         )
 
-    def api_call(self, data, properties=None, timeout=60):
+    def api_call(self, data, properties=None):
         default_properties = {
-            "outputFormat": "json",
-            "annotators": "tokenize,pos,lemma,ssplit,{parser_annotator}".format(
+            'outputFormat': 'json',
+            'annotators': 'tokenize,pos,lemma,ssplit,{parser_annotator}'.format(
                 parser_annotator=self.parser_annotator
             ),
         }
@@ -242,9 +242,9 @@ class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
 
         response = self.session.post(
             self.url,
-            params={"properties": json.dumps(default_properties)},
+            params={'properties': json.dumps(default_properties)},
             data=data.encode(self.encoding),
-            timeout=timeout,
+            timeout=60,
         )
 
         response.raise_for_status()
@@ -266,7 +266,7 @@ class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
         """
         default_properties = {
             # Only splits on '\n', never inside the sentence.
-            "ssplit.eolonly": "true"
+            'ssplit.ssplit.eolonly': 'true'
         }
 
         default_properties.update(properties or {})
@@ -281,8 +281,8 @@ class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
                 tree = self.make_tree(parse)
                 yield iter([tree])
         """
-        parsed_data = self.api_call("\n".join(sentences), properties=default_properties)
-        for parsed_sent in parsed_data["sentences"]:
+        parsed_data = self.api_call('\n'.join(sentences), properties=default_properties)
+        for parsed_sent in parsed_data['sentences']:
             tree = self.make_tree(parsed_sent)
             yield iter([tree])
 
@@ -297,7 +297,7 @@ class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
         """
         parsed_data = self.api_call(text, *args, **kwargs)
 
-        for parse in parsed_data["sentences"]:
+        for parse in parsed_data['sentences']:
             yield self.make_tree(parse)
 
     def tokenize(self, text, properties=None):
@@ -319,15 +319,15 @@ class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
         ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
 
         """
-        default_properties = {"annotators": "tokenize,ssplit"}
+        default_properties = {'annotators': 'tokenize,ssplit'}
 
         default_properties.update(properties or {})
 
         result = self.api_call(text, properties=default_properties)
 
-        for sentence in result["sentences"]:
-            for token in sentence["tokens"]:
-                yield token["originalText"] or token["word"]
+        for sentence in result['sentences']:
+            for token in sentence['tokens']:
+                yield token['originalText'] or token['word']
 
     def tag_sents(self, sentences):
         """
@@ -335,13 +335,13 @@ class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
 
         Takes multiple sentences as a list where each sentence is a list of
         tokens.
-
+        
         :param sentences: Input sentences to tag
         :type sentences: list(list(str))
         :rtype: list(list(tuple(str, str))
         """
         # Converting list(list(str)) -> list(str)
-        sentences = (" ".join(words) for words in sentences)
+        sentences = (' '.join(words) for words in sentences)
         return [sentences[0] for sentences in self.raw_tag_sents(sentences)]
 
     def tag(self, sentence):
@@ -370,27 +370,27 @@ class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
         Tag multiple sentences.
 
         Takes multiple sentences as a list where each sentence is a string.
-
+        
         :param sentences: Input sentences to tag
         :type sentences: list(str)
         :rtype: list(list(list(tuple(str, str)))
         """
         default_properties = {
-            "ssplit.isOneSentence": "true",
-            "annotators": "tokenize,ssplit,",
+            'ssplit.isOneSentence': 'true',
+            'annotators': 'tokenize,ssplit,',
         }
 
         # Supports only 'pos' or 'ner' tags.
-        assert self.tagtype in ["pos", "ner"]
-        default_properties["annotators"] += self.tagtype
+        assert self.tagtype in ['pos', 'ner']
+        default_properties['annotators'] += self.tagtype
         for sentence in sentences:
             tagged_data = self.api_call(sentence, properties=default_properties)
             yield [
                 [
-                    (token["word"], token[self.tagtype])
-                    for token in tagged_sentence["tokens"]
+                    (token['word'], token[self.tagtype])
+                    for token in tagged_sentence['tokens']
                 ]
-                for tagged_sentence in tagged_data["sentences"]
+                for tagged_sentence in tagged_data['sentences']
             ]
 
 
@@ -539,11 +539,11 @@ class CoreNLPParser(GenericCoreNLPParser):
 
     """
 
-    _OUTPUT_FORMAT = "penn"
-    parser_annotator = "parse"
+    _OUTPUT_FORMAT = 'penn'
+    parser_annotator = 'parse'
 
     def make_tree(self, result):
-        return Tree.fromstring(result["parse"])
+        return Tree.fromstring(result['parse'])
 
 
 class CoreNLPDependencyParser(GenericCoreNLPParser):
@@ -710,44 +710,44 @@ class CoreNLPDependencyParser(GenericCoreNLPParser):
 
     """
 
-    _OUTPUT_FORMAT = "conll2007"
-    parser_annotator = "depparse"
+    _OUTPUT_FORMAT = 'conll2007'
+    parser_annotator = 'depparse'
 
     def make_tree(self, result):
 
         return DependencyGraph(
             (
-                " ".join(n_items[1:])  # NLTK expects an iterable of strings...
+                ' '.join(n_items[1:])  # NLTK expects an iterable of strings...
                 for n_items in sorted(transform(result))
             ),
-            cell_separator=" ",  # To make sure that a non-breaking space is kept inside of a token.
+            cell_separator=' ',  # To make sure that a non-breaking space is kept inside of a token.
         )
 
 
 def transform(sentence):
-    for dependency in sentence["basicDependencies"]:
+    for dependency in sentence['basicDependencies']:
 
-        dependent_index = dependency["dependent"]
-        token = sentence["tokens"][dependent_index - 1]
+        dependent_index = dependency['dependent']
+        token = sentence['tokens'][dependent_index - 1]
 
         # Return values that we don't know as '_'. Also, consider tag and ctag
         # to be equal.
         yield (
             dependent_index,
-            "_",
-            token["word"],
-            token["lemma"],
-            token["pos"],
-            token["pos"],
-            "_",
-            str(dependency["governor"]),
-            dependency["dep"],
-            "_",
-            "_",
+            '_',
+            token['word'],
+            token['lemma'],
+            token['pos'],
+            token['pos'],
+            '_',
+            str(dependency['governor']),
+            dependency['dep'],
+            '_',
+            '_',
         )
 
 
-@skip("Skipping all CoreNLP tests.")
+@skip('Skipping all CoreNLP tests.')
 def setup_module(module):
     from nose import SkipTest
 
@@ -756,18 +756,18 @@ def setup_module(module):
     try:
         server = CoreNLPServer(port=9000)
     except LookupError as e:
-        raise SkipTest("Could not instantiate CoreNLPServer.")
+        raise SkipTest('Could not instantiate CoreNLPServer.')
 
     try:
         server.start()
     except CoreNLPServerError as e:
         raise SkipTest(
-            "Skipping CoreNLP tests because the server could not be started. "
-            "Make sure that the 9000 port is free. "
-            "{}".format(e.strerror)
+            'Skipping CoreNLP tests because the server could not be started. '
+            'Make sure that the 9000 port is free. '
+            '{}'.format(e.strerror)
         )
 
 
-@skip("Skipping all CoreNLP tests.")
+@skip('Skipping all CoreNLP tests.')
 def teardown_module(module):
     server.stop()
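
For orientation, a typical way to exercise the interfaces restored above, assuming a CoreNLP server is already listening on the default port 9000 (for example one started via CoreNLPServer(port=9000) followed by .start(), which in turn needs the CORENLP and CORENLP_MODELS environment variables to point at the jars), is roughly:

from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser

# Constituency parse
parser = CoreNLPParser(url='http://localhost:9000')
tree = next(parser.raw_parse('The quick brown fox jumps over the lazy dog.'))
tree.pretty_print()

# Dependency parse, dumped in Malt-TAB style
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
graph = next(dep_parser.raw_parse('The quick brown fox jumps over the lazy dog.'))
print(graph.to_conll(4))

Note that this back-conversion also drops the timeout keyword from api_call() and the stdout/stderr arguments from CoreNLPServer.start(), so callers written against the newer signatures have to fall back to the defaults.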
index 4d3f7c5..8c6156b 100644
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Dependency Grammars
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Jason Narad <jason.narad@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com> (modifications)
 #
@@ -13,6 +13,7 @@ Tools for reading and writing dependency trees.
 The input is assumed to be in Malt-TAB format
 (http://stp.lingfil.uu.se/~nivre/research/MaltXML.html).
 """
+from __future__ import print_function, unicode_literals
 
 from collections import defaultdict
 from itertools import chain
@@ -20,13 +21,18 @@ from pprint import pformat
 import subprocess
 import warnings
 
+from six import string_types
+
 from nltk.tree import Tree
+from nltk.compat import python_2_unicode_compatible
+
 
 #################################################################
 # DependencyGraph Class
 #################################################################
 
 
+@python_2_unicode_compatible
 class DependencyGraph(object):
     """
     A container for the nodes and labelled edges of a dependency structure.
@@ -38,7 +44,7 @@ class DependencyGraph(object):
         cell_extractor=None,
         zero_based=False,
         cell_separator=None,
-        top_relation_label="ROOT",
+        top_relation_label='ROOT',
     ):
         """Dependency graph.
 
@@ -59,19 +65,19 @@ class DependencyGraph(object):
         """
         self.nodes = defaultdict(
             lambda: {
-                "address": None,
-                "word": None,
-                "lemma": None,
-                "ctag": None,
-                "tag": None,
-                "feats": None,
-                "head": None,
-                "deps": defaultdict(list),
-                "rel": None,
+                'address': None,
+                'word': None,
+                'lemma': None,
+                'ctag': None,
+                'tag': None,
+                'feats': None,
+                'head': None,
+                'deps': defaultdict(list),
+                'rel': None,
             }
         )
 
-        self.nodes[0].update({"ctag": "TOP", "tag": "TOP", "address": 0})
+        self.nodes[0].update({'ctag': 'TOP', 'tag': 'TOP', 'address': 0})
 
         self.root = None
 
@@ -98,21 +104,21 @@ class DependencyGraph(object):
         """
         for node in self.nodes.values():
             new_deps = []
-            for dep in node["deps"]:
+            for dep in node['deps']:
                 if dep in originals:
                     new_deps.append(redirect)
                 else:
                     new_deps.append(dep)
-            node["deps"] = new_deps
+            node['deps'] = new_deps
 
     def add_arc(self, head_address, mod_address):
         """
         Adds an arc from the node specified by head_address to the
         node specified by the mod address.
         """
-        relation = self.nodes[mod_address]["rel"]
-        self.nodes[head_address]["deps"].setdefault(relation, [])
-        self.nodes[head_address]["deps"][relation].append(mod_address)
+        relation = self.nodes[mod_address]['rel']
+        self.nodes[head_address]['deps'].setdefault(relation, [])
+        self.nodes[head_address]['deps'][relation].append(mod_address)
         # self.nodes[head_address]['deps'].append(mod_address)
 
     def connect_graph(self):
@@ -122,10 +128,10 @@ class DependencyGraph(object):
         """
         for node1 in self.nodes.values():
             for node2 in self.nodes.values():
-                if node1["address"] != node2["address"] and node2["rel"] != "TOP":
-                    relation = node2["rel"]
-                    node1["deps"].setdefault(relation, [])
-                    node1["deps"][relation].append(node2["address"])
+                if node1['address'] != node2['address'] and node2['rel'] != 'TOP':
+                    relation = node2['rel']
+                    node1['deps'].setdefault(relation, [])
+                    node1['deps'][relation].append(node2['address'])
                     # node1['deps'].append(node2['address'])
 
     def get_by_address(self, node_address):
@@ -163,23 +169,23 @@ class DependencyGraph(object):
 
         """
         # Start the digraph specification
-        s = "digraph G{\n"
-        s += "edge [dir=forward]\n"
-        s += "node [shape=plaintext]\n"
+        s = 'digraph G{\n'
+        s += 'edge [dir=forward]\n'
+        s += 'node [shape=plaintext]\n'
 
         # Draw the remaining nodes
-        for node in sorted(self.nodes.values(), key=lambda v: v["address"]):
+        for node in sorted(self.nodes.values(), key=lambda v: v['address']):
             s += '\n%s [label="%s (%s)"]' % (
-                node["address"],
-                node["address"],
-                node["word"],
+                node['address'],
+                node['address'],
+                node['word'],
             )
-            for rel, deps in node["deps"].items():
+            for rel, deps in node['deps'].items():
                 for dep in deps:
                     if rel is not None:
-                        s += '\n%s -> %s [label="%s"]' % (node["address"], dep, rel)
+                        s += '\n%s -> %s [label="%s"]' % (node['address'], dep, rel)
                     else:
-                        s += "\n%s -> %s " % (node["address"], dep)
+                        s += '\n%s -> %s ' % (node['address'], dep)
         s += "\n}"
 
         return s
@@ -200,19 +206,19 @@ class DependencyGraph(object):
 
         try:
             process = subprocess.Popen(
-                ["dot", "-Tsvg"],
+                ['dot', '-Tsvg'],
                 stdin=subprocess.PIPE,
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
                 universal_newlines=True,
             )
         except OSError:
-            raise Exception("Cannot find the dot binary from Graphviz package")
+            raise Exception('Cannot find the dot binary from Graphviz package')
         out, err = process.communicate(dot_string)
         if err:
             raise Exception(
-                "Cannot create svg representation by running dot from string: {}"
-                "".format(dot_string)
+                'Cannot create svg representation by running dot from string: {}'
+                ''.format(dot_string)
             )
         return out
 
@@ -224,7 +230,7 @@ class DependencyGraph(object):
 
     @staticmethod
     def load(
-        filename, zero_based=False, cell_separator=None, top_relation_label="ROOT"
+        filename, zero_based=False, cell_separator=None, top_relation_label='ROOT'
     ):
         """
         :param filename: a name of a file in Malt-TAB format
@@ -246,7 +252,7 @@ class DependencyGraph(object):
                     cell_separator=cell_separator,
                     top_relation_label=top_relation_label,
                 )
-                for tree_str in infile.read().split("\n\n")
+                for tree_str in infile.read().split('\n\n')
             ]
 
     def left_children(self, node_index):
@@ -254,8 +260,8 @@ class DependencyGraph(object):
         Returns the number of left children under the node specified
         by the given address.
         """
-        children = chain.from_iterable(self.nodes[node_index]["deps"].values())
-        index = self.nodes[node_index]["address"]
+        children = chain.from_iterable(self.nodes[node_index]['deps'].values())
+        index = self.nodes[node_index]['address']
         return sum(1 for c in children if c < index)
 
     def right_children(self, node_index):
@@ -263,13 +269,13 @@ class DependencyGraph(object):
         Returns the number of right children under the node specified
         by the given address.
         """
-        children = chain.from_iterable(self.nodes[node_index]["deps"].values())
-        index = self.nodes[node_index]["address"]
+        children = chain.from_iterable(self.nodes[node_index]['deps'].values())
+        index = self.nodes[node_index]['address']
         return sum(1 for c in children if c > index)
 
     def add_node(self, node):
-        if not self.contains_address(node["address"]):
-            self.nodes[node["address"]].update(node)
+        if not self.contains_address(node['address']):
+            self.nodes[node['address']].update(node)
 
     def _parse(
         self,
@@ -277,7 +283,7 @@ class DependencyGraph(object):
         cell_extractor=None,
         zero_based=False,
         cell_separator=None,
-        top_relation_label="ROOT",
+        top_relation_label='ROOT',
     ):
         """Parse a sentence.
 
@@ -295,11 +301,11 @@ class DependencyGraph(object):
 
         def extract_3_cells(cells, index):
             word, tag, head = cells
-            return index, word, word, tag, tag, "", head, ""
+            return index, word, word, tag, tag, '', head, ''
 
         def extract_4_cells(cells, index):
             word, tag, head, rel = cells
-            return index, word, word, tag, tag, "", head, rel
+            return index, word, word, tag, tag, '', head, rel
 
         def extract_7_cells(cells, index):
             line_index, word, lemma, tag, _, head, rel = cells
@@ -308,7 +314,7 @@ class DependencyGraph(object):
             except ValueError:
                 # index can't be parsed as an integer, use default
                 pass
-            return index, word, lemma, tag, tag, "", head, rel
+            return index, word, lemma, tag, tag, '', head, rel
 
         def extract_10_cells(cells, index):
             line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells
@@ -326,8 +332,8 @@ class DependencyGraph(object):
             10: extract_10_cells,
         }
 
-        if isinstance(input_, str):
-            input_ = (line for line in input_.split("\n"))
+        if isinstance(input_, string_types):
+            input_ = (line for line in input_.split('\n'))
 
         lines = (l.rstrip() for l in input_)
         lines = (l for l in lines if l)
@@ -345,8 +351,8 @@ class DependencyGraph(object):
                     cell_extractor = extractors[cell_number]
                 except KeyError:
                     raise ValueError(
-                        "Number of tab-delimited fields ({0}) not supported by "
-                        "CoNLL(10) or Malt-Tab(4) format".format(cell_number)
+                        'Number of tab-delimited fields ({0}) not supported by '
+                        'CoNLL(10) or Malt-Tab(4) format'.format(cell_number)
                     )
 
             try:
@@ -359,7 +365,7 @@ class DependencyGraph(object):
                 # extractor and doesn't accept or return an index.
                 word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
 
-            if head == "_":
+            if head == '_':
                 continue
 
             head = int(head)
@@ -368,24 +374,24 @@ class DependencyGraph(object):
 
             self.nodes[index].update(
                 {
-                    "address": index,
-                    "word": word,
-                    "lemma": lemma,
-                    "ctag": ctag,
-                    "tag": tag,
-                    "feats": feats,
-                    "head": head,
-                    "rel": rel,
+                    'address': index,
+                    'word': word,
+                    'lemma': lemma,
+                    'ctag': ctag,
+                    'tag': tag,
+                    'feats': feats,
+                    'head': head,
+                    'rel': rel,
                 }
             )
 
             # Make sure that the fake root node has labeled dependencies.
             if (cell_number == 3) and (head == 0):
                 rel = top_relation_label
-            self.nodes[head]["deps"][rel].append(index)
+            self.nodes[head]['deps'][rel].append(index)
 
-        if self.nodes[0]["deps"][top_relation_label]:
-            root_address = self.nodes[0]["deps"][top_relation_label][0]
+        if self.nodes[0]['deps'][top_relation_label]:
+            root_address = self.nodes[0]['deps'][top_relation_label][0]
             self.root = self.nodes[root_address]
             self.top_relation_label = top_relation_label
         else:
@@ -394,9 +400,9 @@ class DependencyGraph(object):
             )
 
     def _word(self, node, filter=True):
-        w = node["word"]
+        w = node['word']
         if filter:
-            if w != ",":
+            if w != ',':
                 return w
         return w
 
@@ -407,8 +413,8 @@ class DependencyGraph(object):
         :return: either a word (if the indexed node is a leaf) or a ``Tree``.
         """
         node = self.get_by_address(i)
-        word = node["word"]
-        deps = sorted(chain.from_iterable(node["deps"].values()))
+        word = node['word']
+        deps = sorted(chain.from_iterable(node['deps'].values()))
 
         if deps:
             return Tree(word, [self._tree(dep) for dep in deps])
@@ -422,8 +428,8 @@ class DependencyGraph(object):
         """
         node = self.root
 
-        word = node["word"]
-        deps = sorted(chain.from_iterable(node["deps"].values()))
+        word = node['word']
+        deps = sorted(chain.from_iterable(node['deps'].values()))
         return Tree(word, [self._tree(dep) for dep in deps])
 
     def triples(self, node=None):
@@ -435,22 +441,22 @@ class DependencyGraph(object):
         if not node:
             node = self.root
 
-        head = (node["word"], node["ctag"])
-        for i in sorted(chain.from_iterable(node["deps"].values())):
+        head = (node['word'], node['ctag'])
+        for i in sorted(chain.from_iterable(node['deps'].values())):
             dep = self.get_by_address(i)
-            yield (head, dep["rel"], (dep["word"], dep["ctag"]))
+            yield (head, dep['rel'], (dep['word'], dep['ctag']))
             for triple in self.triples(node=dep):
                 yield triple
 
     def _hd(self, i):
         try:
-            return self.nodes[i]["head"]
+            return self.nodes[i]['head']
         except IndexError:
             return None
 
     def _rel(self, i):
         try:
-            return self.nodes[i]["rel"]
+            return self.nodes[i]['rel']
         except IndexError:
             return None
 
@@ -484,8 +490,8 @@ class DependencyGraph(object):
         distances = {}
 
         for node in self.nodes.values():
-            for dep in node["deps"]:
-                key = tuple([node["address"], dep])
+            for dep in node['deps']:
+                key = tuple([node['address'], dep])
                 distances[key] = 1
 
         for _ in self.nodes:
@@ -506,13 +512,13 @@ class DependencyGraph(object):
         return False  # return []?
 
     def get_cycle_path(self, curr_node, goal_node_index):
-        for dep in curr_node["deps"]:
+        for dep in curr_node['deps']:
             if dep == goal_node_index:
-                return [curr_node["address"]]
-        for dep in curr_node["deps"]:
+                return [curr_node['address']]
+        for dep in curr_node['deps']:
             path = self.get_cycle_path(self.get_by_address(dep), goal_node_index)
             if len(path) > 0:
-                path.insert(0, curr_node["address"])
+                path.insert(0, curr_node['address'])
                 return path
         return []
 
@@ -526,23 +532,23 @@ class DependencyGraph(object):
         """
 
         if style == 3:
-            template = "{word}\t{tag}\t{head}\n"
+            template = '{word}\t{tag}\t{head}\n'
         elif style == 4:
-            template = "{word}\t{tag}\t{head}\t{rel}\n"
+            template = '{word}\t{tag}\t{head}\t{rel}\n'
         elif style == 10:
             template = (
-                "{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n"
+                '{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n'
             )
         else:
             raise ValueError(
-                "Number of tab-delimited fields ({0}) not supported by "
-                "CoNLL(10) or Malt-Tab(4) format".format(style)
+                'Number of tab-delimited fields ({0}) not supported by '
+                'CoNLL(10) or Malt-Tab(4) format'.format(style)
             )
 
-        return "".join(
+        return ''.join(
             template.format(i=i, **node)
             for i, node in sorted(self.nodes.items())
-            if node["tag"] != "TOP"
+            if node['tag'] != 'TOP'
         )
 
     def nx_graph(self):
@@ -555,7 +561,7 @@ class DependencyGraph(object):
         ]
         self.nx_labels = {}
         for n in nx_nodelist:
-            self.nx_labels[n] = self.nodes[n]["word"]
+            self.nx_labels[n] = self.nodes[n]['word']
 
         g = networkx.MultiDiGraph()
         g.add_nodes_from(nx_nodelist)
@@ -616,7 +622,7 @@ Nov.    NNP     9       VMOD
         networkx.draw_networkx_labels(g, pos, dg.nx_labels)
         pylab.xticks([])
         pylab.yticks([])
-        pylab.savefig("tree.png")
+        pylab.savefig('tree.png')
         pylab.show()
 
 
@@ -633,11 +639,11 @@ def conll_demo():
 
 
 def conll_file_demo():
-    print("Mass conll_read demo...")
-    graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
+    print('Mass conll_read demo...')
+    graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry]
     for graph in graphs:
         tree = graph.tree()
-        print("\n")
+        print('\n')
         tree.pprint()
 
 
@@ -645,11 +651,11 @@ def cycle_finding_demo():
     dg = DependencyGraph(treebank_data)
     print(dg.contains_cycle())
     cyclic_dg = DependencyGraph()
-    cyclic_dg.add_node({"word": None, "deps": [1], "rel": "TOP", "address": 0})
-    cyclic_dg.add_node({"word": None, "deps": [2], "rel": "NTOP", "address": 1})
-    cyclic_dg.add_node({"word": None, "deps": [4], "rel": "NTOP", "address": 2})
-    cyclic_dg.add_node({"word": None, "deps": [1], "rel": "NTOP", "address": 3})
-    cyclic_dg.add_node({"word": None, "deps": [3], "rel": "NTOP", "address": 4})
+    cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0})
+    cyclic_dg.add_node({'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1})
+    cyclic_dg.add_node({'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2})
+    cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3})
+    cyclic_dg.add_node({'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4})
     print(cyclic_dg.contains_cycle())
 
 
@@ -775,5 +781,5 @@ conll_data2 = """1   Cathy             Cathy             N     N     eigen|ev|ne
 16  .                 .                 Punc  Punc  punt                             15  punct   _  _
 """
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
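
Since the _parse() method restored above accepts 3-, 4-, 7- or 10-column input, a small self-contained round trip through the 4-column Malt-TAB form (the toy sentence is purely illustrative) looks like:

from nltk.parse.dependencygraph import DependencyGraph

malt_tab = (
    "John\tNNP\t2\tSUB\n"
    "saw\tVBD\t0\tROOT\n"
    "Mary\tNNP\t2\tOBJ\n"
)
dg = DependencyGraph(malt_tab)
print(dg.tree())        # (saw John Mary)
print(dg.to_conll(4))   # back to the word/tag/head/rel columns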
index d6cc14e..fdb8136 100644
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: An Incremental Earley Chart Parser
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
 #         Rob Speer <rspeer@mit.edu>
 #         Edward Loper <edloper@gmail.com>
@@ -25,8 +25,9 @@ This is appealing for, say, speech recognizer hypothesis filtering.
 The main parser class is ``EarleyChartParser``, which is a top-down
 algorithm, originally formulated by Jay Earley (1970).
 """
+from __future__ import print_function, division
 
-from time import perf_counter
+from six.moves import range
 
 from nltk.parse.chart import (
     Chart,
@@ -99,7 +100,7 @@ class IncrementalChart(Chart):
         # Make sure it's a valid index.
         for key in restr_keys:
             if not hasattr(EdgeI, key):
-                raise ValueError("Bad restriction: %s" % key)
+                raise ValueError('Bad restriction: %s' % key)
 
         # Create the index.
         index = self._indexes[restr_keys] = tuple({} for x in self._positions())
@@ -149,7 +150,7 @@ class FeatureIncrementalChart(IncrementalChart, FeatureChart):
         # Make sure it's a valid index.
         for key in restr_keys:
             if not hasattr(EdgeI, key):
-                raise ValueError("Bad restriction: %s" % key)
+                raise ValueError('Bad restriction: %s' % key)
 
         # Create the index.
         index = self._indexes[restr_keys] = tuple({} for x in self._positions())
@@ -510,7 +511,7 @@ def demo(
     print_grammar=False,
     print_trees=True,
     trace=2,
-    sent="I saw John with a dog with my cookie",
+    sent='I saw John with a dog with my cookie',
     numparses=5,
 ):
     """
@@ -534,14 +535,14 @@ def demo(
 
     # Do the parsing.
     earley = EarleyChartParser(grammar, trace=trace)
-    t = perf_counter()
+    t = time.clock()
     chart = earley.chart_parse(tokens)
     parses = list(chart.parses(grammar.start()))
-    t = perf_counter() - t
+    t = time.clock() - t
 
     # Print results.
     if numparses:
-        assert len(parses) == numparses, "Not all parses found"
+        assert len(parses) == numparses, 'Not all parses found'
     if print_trees:
         for tree in parses:
             print(tree)
@@ -551,5 +552,5 @@ def demo(
         print("Time:", t)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
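
The file above is the incremental Earley chart parser; a minimal usage sketch with a throwaway grammar (chosen here only for illustration) is:

from nltk import CFG
from nltk.parse.earleychart import EarleyChartParser

grammar = CFG.fromstring("""
    S  -> NP VP
    NP -> 'I' | 'John'
    VP -> V NP
    V  -> 'saw'
""")
parser = EarleyChartParser(grammar)
for tree in parser.parse('I saw John'.split()):
    print(tree)

Note that time.clock(), which this change reinstates in place of time.perf_counter(), was removed in Python 3.8, so the demo timing only runs on the older interpreters that the 3.4 code line targets.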
index 672ad6d..d79ad46 100644
@@ -2,10 +2,12 @@
 #
 # Author: Long Duong <longdt219@gmail.com>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import division
+
 import unicodedata
 
 
index a06c50f..ee9e274 100644
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Chart Parser for Feature-Based Grammars
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Rob Speer <rspeer@mit.edu>
 #         Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
 # URL: <http://nltk.org/>
 Extension of chart parsing implementation to handle grammars with
 feature structures as nodes.
 """
-from time import perf_counter
+from __future__ import print_function, unicode_literals
 
+from six.moves import range
+
+from nltk.compat import python_2_unicode_compatible
 from nltk.featstruct import FeatStruct, unify, TYPE, find_variables
 from nltk.sem import logic
 from nltk.tree import Tree
@@ -44,6 +47,7 @@ from nltk.parse.chart import (
 # ////////////////////////////////////////////////////////////
 
 
+@python_2_unicode_compatible
 class FeatureTreeEdge(TreeEdge):
     """
     A specialized tree edge that allows shared variable bindings
@@ -145,12 +149,12 @@ class FeatureTreeEdge(TreeEdge):
 
     def __str__(self):
         if self.is_complete():
-            return super().__str__()
+            return TreeEdge.__unicode__(self)
         else:
-            bindings = "{%s}" % ", ".join(
-                "%s: %r" % item for item in sorted(self._bindings.items())
+            bindings = '{%s}' % ', '.join(
+                '%s: %r' % item for item in sorted(self._bindings.items())
             )
-            return "%s %s" % (super().__str__(), bindings)
+            return '%s %s' % (TreeEdge.__unicode__(self), bindings)
 
 
 # ////////////////////////////////////////////////////////////
@@ -197,7 +201,7 @@ class FeatureChart(Chart):
         # Make sure it's a valid index.
         for key in restr_keys:
             if not hasattr(EdgeI, key):
-                raise ValueError("Bad restriction: %s" % key)
+                raise ValueError('Bad restriction: %s' % key)
 
         # Create the index.
         index = self._indexes[restr_keys] = {}
@@ -583,7 +587,7 @@ class InstantiateVarsChart(FeatureChart):
         return dict(
             (var, logic.unique_variable())
             for var in edge.lhs().variables()
-            if var.name.startswith("@")
+            if var.name.startswith('@')
         )
 
 
@@ -626,7 +630,7 @@ def demo(
     print_sentence=True,
     trace=1,
     parser=FeatureChartParser,
-    sent="I saw John with a dog with my cookie",
+    sent='I saw John with a dog with my cookie',
 ):
     import sys, time
 
@@ -639,12 +643,12 @@ def demo(
     if print_sentence:
         print("Sentence:", sent)
     tokens = sent.split()
-    t = perf_counter()
+    t = time.clock()
     cp = parser(grammar, trace=trace)
     chart = cp.chart_parse(tokens)
     trees = list(chart.parses(grammar.start()))
     if print_times:
-        print("Time: %s" % (perf_counter() - t))
+        print("Time: %s" % (time.clock() - t))
     if print_trees:
         for tree in trees:
             print(tree)
@@ -655,22 +659,22 @@ def demo(
 def run_profile():
     import profile
 
-    profile.run("for i in range(1): demo()", "/tmp/profile.out")
+    profile.run('for i in range(1): demo()', '/tmp/profile.out')
     import pstats
 
-    p = pstats.Stats("/tmp/profile.out")
-    p.strip_dirs().sort_stats("time", "cum").print_stats(60)
-    p.strip_dirs().sort_stats("cum", "time").print_stats(60)
+    p = pstats.Stats('/tmp/profile.out')
+    p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
+    p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     from nltk.data import load
 
     demo()
     print()
-    grammar = load("grammars/book_grammars/feat0.fcfg")
+    grammar = load('grammars/book_grammars/feat0.fcfg')
     cp = FeatureChartParser(grammar, trace=2)
-    sent = "Kim likes children"
+    sent = 'Kim likes children'
     tokens = sent.split()
     trees = cp.parse(tokens)
     for tree in trees:
index 4549b8d..e0a7cb2 100644
@@ -1,12 +1,13 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Generating from a CFG
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 #
+from __future__ import print_function
 
 import itertools
 import sys
@@ -78,12 +79,12 @@ demo_grammar = """
 def demo(N=23):
     from nltk.grammar import CFG
 
-    print("Generating the first %d sentences for demo grammar:" % (N,))
+    print('Generating the first %d sentences for demo grammar:' % (N,))
     print(demo_grammar)
     grammar = CFG.fromstring(demo_grammar)
     for n, sent in enumerate(generate(grammar, n=N), 1):
-        print("%3d. %s" % (n, " ".join(sent)))
+        print('%3d. %s' % (n, ' '.join(sent)))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
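
As a quick illustration of the generate() helper exercised by the demo above, with a tiny hand-written grammar (illustrative only):

from nltk import CFG
from nltk.parse.generate import generate

grammar = CFG.fromstring("""
    S  -> NP VP
    NP -> 'the' N
    N  -> 'dog' | 'cat'
    VP -> 'sleeps' | 'barks'
""")
for n, sent in enumerate(generate(grammar, n=4), 1):
    print('%d. %s' % (n, ' '.join(sent)))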
index 523901e..2523927 100644
@@ -4,16 +4,20 @@
 # Author: Dan Garrette <dhgarrette@gmail.com>
 # Contributor: Liling Tan, Mustufain, osamamukhtar11
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import print_function, unicode_literals
+
 import os
 import sys
 import tempfile
 import subprocess
 import inspect
 
+from six import text_type
+
 from nltk.data import ZipFilePathPointer
 from nltk.internals import find_dir, find_file, find_jars_within_path
 
@@ -27,34 +31,34 @@ def malt_regex_tagger():
 
     _tagger = RegexpTagger(
         [
-            (r"\.$", "."),
-            (r"\,$", ","),
-            (r"\?$", "?"),  # fullstop, comma, Qmark
-            (r"\($", "("),
-            (r"\)$", ")"),  # round brackets
-            (r"\[$", "["),
-            (r"\]$", "]"),  # square brackets
-            (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # cardinal numbers
-            (r"(The|the|A|a|An|an)$", "DT"),  # articles
-            (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"),  # pronouns
-            (r"(His|his|Her|her|Its|its)$", "PRP$"),  # possesive
-            (r"(my|Your|your|Yours|yours)$", "PRP$"),  # possesive
-            (r"(on|On|in|In|at|At|since|Since)$", "IN"),  # time prepopsitions
-            (r"(for|For|ago|Ago|before|Before)$", "IN"),  # time prepopsitions
-            (r"(till|Till|until|Until)$", "IN"),  # time prepopsitions
-            (r"(by|By|beside|Beside)$", "IN"),  # space prepopsitions
-            (r"(under|Under|below|Below)$", "IN"),  # space prepopsitions
-            (r"(over|Over|above|Above)$", "IN"),  # space prepopsitions
-            (r"(across|Across|through|Through)$", "IN"),  # space prepopsitions
-            (r"(into|Into|towards|Towards)$", "IN"),  # space prepopsitions
-            (r"(onto|Onto|from|From)$", "IN"),  # space prepopsitions
-            (r".*able$", "JJ"),  # adjectives
-            (r".*ness$", "NN"),  # nouns formed from adjectives
-            (r".*ly$", "RB"),  # adverbs
-            (r".*s$", "NNS"),  # plural nouns
-            (r".*ing$", "VBG"),  # gerunds
-            (r".*ed$", "VBD"),  # past tense verbs
-            (r".*", "NN"),  # nouns (default)
+            (r'\.$', '.'),
+            (r'\,$', ','),
+            (r'\?$', '?'),  # fullstop, comma, Qmark
+            (r'\($', '('),
+            (r'\)$', ')'),  # round brackets
+            (r'\[$', '['),
+            (r'\]$', ']'),  # square brackets
+            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
+            (r'(The|the|A|a|An|an)$', 'DT'),  # articles
+            (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'),  # pronouns
+            (r'(His|his|Her|her|Its|its)$', 'PRP$'),  # possesive
+            (r'(my|Your|your|Yours|yours)$', 'PRP$'),  # possesive
+            (r'(on|On|in|In|at|At|since|Since)$', 'IN'),  # time prepopsitions
+            (r'(for|For|ago|Ago|before|Before)$', 'IN'),  # time prepopsitions
+            (r'(till|Till|until|Until)$', 'IN'),  # time prepopsitions
+            (r'(by|By|beside|Beside)$', 'IN'),  # space prepopsitions
+            (r'(under|Under|below|Below)$', 'IN'),  # space prepopsitions
+            (r'(over|Over|above|Above)$', 'IN'),  # space prepopsitions
+            (r'(across|Across|through|Through)$', 'IN'),  # space prepopsitions
+            (r'(into|Into|towards|Towards)$', 'IN'),  # space prepopsitions
+            (r'(onto|Onto|from|From)$', 'IN'),  # space prepopsitions
+            (r'.*able$', 'JJ'),  # adjectives
+            (r'.*ness$', 'NN'),  # nouns formed from adjectives
+            (r'.*ly$', 'RB'),  # adverbs
+            (r'.*s$', 'NNS'),  # plural nouns
+            (r'.*ing$', 'VBG'),  # gerunds
+            (r'.*ed$', 'VBD'),  # past tense verbs
+            (r'.*', 'NN'),  # nouns (default)
         ]
     )
     return _tagger.tag
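
The fallback tagger above is purely pattern-based; calling it directly on a pre-tokenised sentence gives output along these lines:

tag = malt_regex_tagger()
print(tag('The dog quickly crossed the road .'.split()))
# [('The', 'DT'), ('dog', 'NN'), ('quickly', 'RB'),
#  ('crossed', 'VBD'), ('the', 'DT'), ('road', 'NN'), ('.', '.')]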
@@ -67,16 +71,16 @@ def find_maltparser(parser_dirname):
     if os.path.exists(parser_dirname):  # If a full path is given.
         _malt_dir = parser_dirname
     else:  # Try to find path to maltparser directory in environment variables.
-        _malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",))
+        _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))
     # Checks that that the found directory contains all the necessary .jar
-    malt_dependencies = ["", "", ""]
+    malt_dependencies = ['', '', '']
     _malt_jars = set(find_jars_within_path(_malt_dir))
     _jars = set(os.path.split(jar)[1] for jar in _malt_jars)
-    malt_dependencies = set(["log4j.jar", "libsvm.jar", "liblinear-1.8.jar"])
+    malt_dependencies = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'])
 
     assert malt_dependencies.issubset(_jars)
     assert any(
-        filter(lambda i: i.startswith("maltparser-") and i.endswith(".jar"), _jars)
+        filter(lambda i: i.startswith('maltparser-') and i.endswith('.jar'), _jars)
     )
     return list(_malt_jars)
 
@@ -86,11 +90,11 @@ def find_malt_model(model_filename):
     A module to find pre-trained MaltParser model.
     """
     if model_filename is None:
-        return "malt_temp.mco"
+        return 'malt_temp.mco'
     elif os.path.exists(model_filename):  # If a full path is given.
         return model_filename
     else:  # Try to find path to malt model in environment variables.
-        return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False)
+        return find_file(model_filename, env_vars=('MALT_MODEL',), verbose=False)
 
 
 class MaltParser(ParserI):
@@ -149,13 +153,13 @@ class MaltParser(ParserI):
         )
         # Initialize model.
         self.model = find_malt_model(model_filename)
-        self._trained = self.model != "malt_temp.mco"
+        self._trained = self.model != 'malt_temp.mco'
         # Set the working_dir parameters i.e. `-w` from MaltParser's option.
         self.working_dir = tempfile.gettempdir()
         # Initialize POS tagger.
         self.tagger = tagger if tagger is not None else malt_regex_tagger()
 
-    def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"):
+    def parse_tagged_sents(self, sentences, verbose=False, top_relation_label='null'):
         """
         Use MaltParser to parse multiple POS tagged sentences. Takes multiple
         sentences where each sentence is a list of (word, tag) tuples.
@@ -170,17 +174,17 @@ class MaltParser(ParserI):
             raise Exception("Parser has not been trained. Call train() first.")
 
         with tempfile.NamedTemporaryFile(
-            prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False
+            prefix='malt_input.conll.', dir=self.working_dir, mode='w', delete=False
         ) as input_file:
             with tempfile.NamedTemporaryFile(
-                prefix="malt_output.conll.",
+                prefix='malt_output.conll.',
                 dir=self.working_dir,
-                mode="w",
+                mode='w',
                 delete=False,
             ) as output_file:
                 # Convert list of sentences to CONLL format.
                 for line in taggedsents_to_conll(sentences):
-                    input_file.write(str(line))
+                    input_file.write(text_type(line))
                 input_file.close()
 
                 # Generate command to run maltparser.
@@ -199,15 +203,15 @@ class MaltParser(ParserI):
                 ret = self._execute(cmd, verbose)  # Run command.
                 os.chdir(_current_path)  # Change back to current path.
 
-                if ret != 0:
+                if ret != 0:
                     raise Exception(
                         "MaltParser parsing (%s) failed with exit "
-                        "code %d" % (" ".join(cmd), ret)
+                        "code %d" % (' '.join(cmd), ret)
                     )
 
                 # Must return iter(iter(Tree))
                 with open(output_file.name) as infile:
-                    for tree_str in infile.read().split("\n\n"):
+                    for tree_str in infile.read().split('\n\n'):
                         yield (
                             iter(
                                 [
@@ -221,7 +225,7 @@ class MaltParser(ParserI):
         os.remove(input_file.name)
         os.remove(output_file.name)
 
-    def parse_sents(self, sentences, verbose=False, top_relation_label="null"):
+    def parse_sents(self, sentences, verbose=False, top_relation_label='null'):
         """
         Use MaltParser to parse multiple sentences.
         Takes a list of sentences, where each sentence is a list of words.
@@ -247,26 +251,26 @@ class MaltParser(ParserI):
         :type outputfilename: str
         """
 
-        cmd = ["java"]
+        cmd = ['java']
         cmd += self.additional_java_args  # Adds additional java arguments
         # Joins classpaths with ";" on Windows and with ":" on Linux/Mac
-        classpaths_separator = ";" if sys.platform.startswith("win") else ":"
+        classpaths_separator = ';' if sys.platform.startswith('win') else ':'
         cmd += [
-            "-cp",
+            '-cp',
             classpaths_separator.join(self.malt_jars),
         ]  # Adds classpaths for jars
-        cmd += ["org.maltparser.Malt"]  # Adds the main function.
+        cmd += ['org.maltparser.Malt']  # Adds the main function.
 
         # Adds the model file.
         if os.path.exists(self.model):  # when parsing
-            cmd += ["-c", os.path.split(self.model)[-1]]
+            cmd += ['-c', os.path.split(self.model)[-1]]
         else:  # when learning
-            cmd += ["-c", self.model]
+            cmd += ['-c', self.model]
 
-        cmd += ["-i", inputfilename]
-        if mode == "parse":
-            cmd += ["-o", outputfilename]
-        cmd += ["-m", mode]  # mode use to generate parses.
+        cmd += ['-i', inputfilename]
+        if mode == 'parse':
+            cmd += ['-o', outputfilename]
+        cmd += ['-m', mode]  # mode used to generate parses.
         return cmd
 
     @staticmethod
@@ -285,10 +289,10 @@ class MaltParser(ParserI):
 
         # Write the conll_str to malt_train.conll file in /tmp/
         with tempfile.NamedTemporaryFile(
-            prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
+            prefix='malt_train.conll.', dir=self.working_dir, mode='w', delete=False
         ) as input_file:
-            input_str = "\n".join(dg.to_conll(10) for dg in depgraphs)
-            input_file.write(str(input_str))
+            input_str = '\n'.join(dg.to_conll(10) for dg in depgraphs)
+            input_file.write(text_type(input_str))
         # Trains the model with the malt_train.conll
         self.train_from_file(input_file.name, verbose=verbose)
         # Removes the malt_train.conll once training finishes.
@@ -305,11 +309,11 @@ class MaltParser(ParserI):
         # then we need to do some extra massaging
         if isinstance(conll_file, ZipFilePathPointer):
             with tempfile.NamedTemporaryFile(
-                prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
+                prefix='malt_train.conll.', dir=self.working_dir, mode='w', delete=False
             ) as input_file:
                 with conll_file.open() as conll_input_file:
                     conll_str = conll_input_file.read()
-                    input_file.write(str(conll_str))
+                    input_file.write(text_type(conll_str))
                 return self.train_from_file(input_file.name, verbose=verbose)
 
         # Generate command to run maltparser.
@@ -318,14 +322,14 @@ class MaltParser(ParserI):
         if ret != 0:
             raise Exception(
                 "MaltParser training (%s) failed with exit "
-                "code %d" % (" ".join(cmd), ret)
+                "code %d" % (' '.join(cmd), ret)
             )
         self._trained = True
 
 
 if __name__ == '__main__':
-    """
-    A demonstration function to show how NLTK users can use the malt parser API.
+    '''
+    A demonstration function to show how NLTK users can use the malt parser API.
 
     >>> from nltk import pos_tag
     >>> assert 'MALT_PARSER' in os.environ, str(
@@ -360,9 +364,9 @@ if __name__ == '__main__':
     >>> # Parse a single sentence.
     >>> parsed_sent1 = mp.parse_one(sent1)
     >>> parsed_sent2 = mp.parse_one(sent2)
-    >>> print(parsed_sent1.tree())
+    >>> print (parsed_sent1.tree())
     (sees John Mary .)
-    >>> print(parsed_sent2.tree())
+    >>> print (parsed_sent2.tree())
     (walks John (dog a) .)
     >>>
     >>> # Parsing multiple sentences.
@@ -389,7 +393,7 @@ if __name__ == '__main__':
     (shot I (elephant an) (in (pajamas my)) .)
     >>> print(next(next(parsed_sents)).tree())
     (flies Time (like banana) .)
-    """
-
+    '''
     import doctest
+
     doctest.testmod()
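
The lines that parse_tagged_sents writes to the malt_input.conll temporary file come from taggedsents_to_conll. A minimal sketch, assuming the helper lives in nltk.parse.util as in recent NLTK releases; the tagged sentence is invented.

from nltk.parse.util import taggedsents_to_conll

tagged = [[('John', 'NNP'), ('sees', 'VBZ'), ('Mary', 'NNP'), ('.', '.')]]
for line in taggedsents_to_conll(tagged):
    print(line, end='')  # tab-separated CoNLL columns, one token per line
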
index 2b901dd..9b8bddc 100644 (file)
@@ -1,15 +1,18 @@
 # Natural Language Toolkit: Dependency Grammars
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Jason Narad <jason.narad@gmail.com>
 #
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 #
+from __future__ import print_function
 
 import math
 import logging
 
+from six.moves import range
+
 from nltk.parse.dependencygraph import DependencyGraph
 
 logger = logging.getLogger(__name__)
@@ -33,7 +36,7 @@ class DependencyScorerI(object):
 
     def __init__(self):
         if self.__class__ == DependencyScorerI:
-            raise TypeError("DependencyScorerI is an abstract interface")
+            raise TypeError('DependencyScorerI is an abstract interface')
 
     def train(self, graphs):
         """
@@ -115,17 +118,17 @@ class NaiveBayesDependencyScorer(DependencyScorerI):
         for graph in graphs:
             for head_node in graph.nodes.values():
                 for child_index, child_node in graph.nodes.items():
-                    if child_index in head_node["deps"]:
+                    if child_index in head_node['deps']:
                         label = "T"
                     else:
                         label = "F"
                     labeled_examples.append(
                         (
                             dict(
-                                a=head_node["word"],
-                                b=head_node["tag"],
-                                c=child_node["word"],
-                                d=child_node["tag"],
+                                a=head_node['word'],
+                                b=head_node['tag'],
+                                c=child_node['word'],
+                                d=child_node['tag'],
                             ),
                             label,
                         )
@@ -152,10 +155,10 @@ class NaiveBayesDependencyScorer(DependencyScorerI):
                 edges.append(
                     (
                         dict(
-                            a=head_node["word"],
-                            b=head_node["tag"],
-                            c=child_node["word"],
-                            d=child_node["tag"],
+                            a=head_node['word'],
+                            b=head_node['tag'],
+                            c=child_node['word'],
+                            d=child_node['tag'],
                         )
                     )
                 )
@@ -165,7 +168,7 @@ class NaiveBayesDependencyScorer(DependencyScorerI):
         row = []
         count = 0
         for pdist in self.classifier.prob_classify_many(edges):
-            logger.debug("%.4f %.4f", pdist.prob("T"), pdist.prob("F"))
+            logger.debug('%.4f %.4f', pdist.prob('T'), pdist.prob('F'))
             # smoothing in case the probability = 0
             row.append([math.log(pdist.prob("T") + 0.00000000001)])
             count += 1
@@ -182,7 +185,7 @@ class NaiveBayesDependencyScorer(DependencyScorerI):
 # A short class necessary to show parsing example from paper
 class DemoScorer(DependencyScorerI):
     def train(self, graphs):
-        print("Training...")
+        print('Training...')
 
     def score(self, graph):
         # scores for Keith Hall 'K-best Spanning Tree Parsing' paper
@@ -254,7 +257,7 @@ class ProbabilisticNonprojectiveParser(object):
         """
         Creates a new non-projective parser.
         """
-        logging.debug("initializing prob. nonprojective...")
+        logging.debug('initializing prob. nonprojective...')
 
     def train(self, graphs, dependency_scorer):
         """
@@ -296,12 +299,12 @@ class ProbabilisticNonprojectiveParser(object):
         :type g_graph, b_graph, c_graph: DependencyGraph
         :param g_graph, b_graph, c_graph: Graphs which need to be updated.
         """
-        logger.debug("Collapsing nodes...")
+        logger.debug('Collapsing nodes...')
         # Collapse all cycle nodes into v_n+1 in G_Graph
         for cycle_node_index in cycle_path:
             g_graph.remove_by_address(cycle_node_index)
         g_graph.add_node(new_node)
-        g_graph.redirect_arcs(cycle_path, new_node["address"])
+        g_graph.redirect_arcs(cycle_path, new_node['address'])
 
     def update_edge_scores(self, new_node, cycle_path):
         """
@@ -313,12 +316,12 @@ class ProbabilisticNonprojectiveParser(object):
         :type cycle_path: A list of integers.
         :param cycle_path: A list of node addresses that belong to the cycle.
         """
-        logger.debug("cycle %s", cycle_path)
+        logger.debug('cycle %s', cycle_path)
 
         cycle_path = self.compute_original_indexes(cycle_path)
 
-        logger.debug("old cycle %s", cycle_path)
-        logger.debug("Prior to update: %s", self.scores)
+        logger.debug('old cycle %s', cycle_path)
+        logger.debug('Prior to update: %s', self.scores)
 
         for i, row in enumerate(self.scores):
             for j, column in enumerate(self.scores[i]):
@@ -326,7 +329,7 @@ class ProbabilisticNonprojectiveParser(object):
                 if j in cycle_path and i not in cycle_path and self.scores[i][j]:
                     subtract_val = self.compute_max_subtract_score(j, cycle_path)
 
-                    logger.debug("%s - %s", self.scores[i][j], subtract_val)
+                    logger.debug('%s - %s', self.scores[i][j], subtract_val)
 
                     new_vals = []
                     for cur_val in self.scores[i][j]:
@@ -339,7 +342,7 @@ class ProbabilisticNonprojectiveParser(object):
                 if i in cycle_path and j in cycle_path:
                     self.scores[i][j] = []
 
-        logger.debug("After update: %s", self.scores)
+        logger.debug('After update: %s', self.scores)
 
     def compute_original_indexes(self, new_indexes):
         """
@@ -398,18 +401,19 @@ class ProbabilisticNonprojectiveParser(object):
         the node that is arced to.
         """
         originals = self.compute_original_indexes([node_index])
-        logger.debug("originals: %s", originals)
+        logger.debug('originals: %s', originals)
 
         max_arc = None
         max_score = None
         for row_index in range(len(self.scores)):
             for col_index in range(len(self.scores[row_index])):
+                # print self.scores[row_index][col_index]
                 if col_index in originals and (
                     max_score is None or self.scores[row_index][col_index] > max_score
                 ):
                     max_score = self.scores[row_index][col_index]
                     max_arc = row_index
-                    logger.debug("%s, %s", row_index, col_index)
+                    logger.debug('%s, %s', row_index, col_index)
 
         logger.debug(max_score)
 
@@ -456,15 +460,16 @@ class ProbabilisticNonprojectiveParser(object):
         g_graph = DependencyGraph()
         for index, token in enumerate(tokens):
             g_graph.nodes[index + 1].update(
-                {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
+                {'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index + 1}
             )
+        # print (g_graph.nodes)
 
         # Fully connect non-root nodes in g_graph
         g_graph.connect_graph()
         original_graph = DependencyGraph()
         for index, token in enumerate(tokens):
             original_graph.nodes[index + 1].update(
-                {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
+                {'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index + 1}
             )
 
         b_graph = DependencyGraph()
@@ -472,32 +477,32 @@ class ProbabilisticNonprojectiveParser(object):
 
         for index, token in enumerate(tokens):
             c_graph.nodes[index + 1].update(
-                {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1}
+                {'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index + 1}
             )
 
         # Assign initial scores to g_graph edges
         self.initialize_edge_scores(g_graph)
         logger.debug(self.scores)
         # Initialize a list of unvisited vertices (by node address)
-        unvisited_vertices = [vertex["address"] for vertex in c_graph.nodes.values()]
+        unvisited_vertices = [vertex['address'] for vertex in c_graph.nodes.values()]
         # Iterate over unvisited vertices
         nr_vertices = len(tokens)
         betas = {}
         while unvisited_vertices:
             # Mark current node as visited
             current_vertex = unvisited_vertices.pop(0)
-            logger.debug("current_vertex: %s", current_vertex)
+            logger.debug('current_vertex: %s', current_vertex)
             # Get corresponding node n_i to vertex v_i
             current_node = g_graph.get_by_address(current_vertex)
-            logger.debug("current_node: %s", current_node)
+            logger.debug('current_node: %s', current_node)
             # Get best in-edge node b for current node
             best_in_edge = self.best_incoming_arc(current_vertex)
             betas[current_vertex] = self.original_best_arc(current_vertex)
-            logger.debug("best in arc: %s --> %s", best_in_edge, current_vertex)
+            logger.debug('best in arc: %s --> %s', best_in_edge, current_vertex)
             # b_graph = Union(b_graph, b)
             for new_vertex in [current_vertex, best_in_edge]:
                 b_graph.nodes[new_vertex].update(
-                    {"word": "TEMP", "rel": "NTOP", "address": new_vertex}
+                    {'word': 'TEMP', 'rel': 'NTOP', 'address': new_vertex}
                 )
             b_graph.add_arc(best_in_edge, current_vertex)
             # Beta(current node) = b  - stored for parse recovery
@@ -505,17 +510,17 @@ class ProbabilisticNonprojectiveParser(object):
             cycle_path = b_graph.contains_cycle()
             if cycle_path:
                 # Create a new node v_n+1 with address = len(nodes) + 1
-                new_node = {"word": "NONE", "rel": "NTOP", "address": nr_vertices + 1}
+                new_node = {'word': 'NONE', 'rel': 'NTOP', 'address': nr_vertices + 1}
                 # c_graph = Union(c_graph, v_n+1)
                 c_graph.add_node(new_node)
                 # Collapse all nodes in cycle C into v_n+1
                 self.update_edge_scores(new_node, cycle_path)
                 self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
                 for cycle_index in cycle_path:
-                    c_graph.add_arc(new_node["address"], cycle_index)
+                    c_graph.add_arc(new_node['address'], cycle_index)
                     # self.replaced_by[cycle_index] = new_node['address']
 
-                self.inner_nodes[new_node["address"]] = cycle_path
+                self.inner_nodes[new_node['address']] = cycle_path
 
                 # Add v_n+1 to list of unvisited vertices
                 unvisited_vertices.insert(0, nr_vertices + 1)
@@ -527,30 +532,30 @@ class ProbabilisticNonprojectiveParser(object):
                 for cycle_node_address in cycle_path:
                     b_graph.remove_by_address(cycle_node_address)
 
-            logger.debug("g_graph: %s", g_graph)
-            logger.debug("b_graph: %s", b_graph)
-            logger.debug("c_graph: %s", c_graph)
-            logger.debug("Betas: %s", betas)
-            logger.debug("replaced nodes %s", self.inner_nodes)
+            logger.debug('g_graph: %s', g_graph)
+            logger.debug('b_graph: %s', b_graph)
+            logger.debug('c_graph: %s', c_graph)
+            logger.debug('Betas: %s', betas)
+            logger.debug('replaced nodes %s', self.inner_nodes)
 
         # Recover parse tree
-        logger.debug("Final scores: %s", self.scores)
+        logger.debug('Final scores: %s', self.scores)
 
-        logger.debug("Recovering parse...")
+        logger.debug('Recovering parse...')
         for i in range(len(tokens) + 1, nr_vertices + 1):
             betas[betas[i][1]] = betas[i]
 
-        logger.debug("Betas: %s", betas)
+        logger.debug('Betas: %s', betas)
         for node in original_graph.nodes.values():
             # TODO: It's dangerous to assume that deps is a dictionary
             # because it's a default dictionary. Ideally, here we should not
             # be concerned how dependencies are stored inside of a dependency
             # graph.
-            node["deps"] = {}
+            node['deps'] = {}
         for i in range(1, len(tokens) + 1):
             original_graph.add_arc(betas[i][0], betas[i][1])
 
-        logger.debug("Done.")
+        logger.debug('Done.')
         yield original_graph
 
 
@@ -598,21 +603,21 @@ class NonprojectiveDependencyParser(object):
 
         for index, token in enumerate(tokens):
             self._graph.nodes[index] = {
-                "word": token,
-                "deps": [],
-                "rel": "NTOP",
-                "address": index,
+                'word': token,
+                'deps': [],
+                'rel': 'NTOP',
+                'address': index,
             }
 
         for head_node in self._graph.nodes.values():
             deps = []
             for dep_node in self._graph.nodes.values():
                 if (
-                    self._grammar.contains(head_node["word"], dep_node["word"])
-                    and head_node["word"] != dep_node["word"]
+                    self._grammar.contains(head_node['word'], dep_node['word'])
+                    and head_node['word'] != dep_node['word']
                 ):
-                    deps.append(dep_node["address"])
-            head_node["deps"] = deps
+                    deps.append(dep_node['address'])
+            head_node['deps'] = deps
 
         # Create lattice of possible heads
         roots = []
@@ -692,13 +697,13 @@ class NonprojectiveDependencyParser(object):
                 head_address = head_index + 1
 
                 node = graph.nodes[address]
-                node.update({"word": token, "address": address})
+                node.update({'word': token, 'address': address})
 
                 if head_address == 0:
-                    rel = "ROOT"
+                    rel = 'ROOT'
                 else:
-                    rel = ""
-                graph.nodes[head_index + 1]["deps"][rel].append(address)
+                    rel = ''
+                graph.nodes[head_index + 1]['deps'][rel].append(address)
 
             # TODO: check for cycles
             yield graph
@@ -718,18 +723,18 @@ def demo():
 def hall_demo():
     npp = ProbabilisticNonprojectiveParser()
     npp.train([], DemoScorer())
-    for parse_graph in npp.parse(["v1", "v2", "v3"], [None, None, None]):
+    for parse_graph in npp.parse(['v1', 'v2', 'v3'], [None, None, None]):
         print(parse_graph)
 
 
 def nonprojective_conll_parse_demo():
     from nltk.parse.dependencygraph import conll_data2
 
-    graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
+    graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry]
     npp = ProbabilisticNonprojectiveParser()
     npp.train(graphs, NaiveBayesDependencyScorer())
     for parse_graph in npp.parse(
-        ["Cathy", "zag", "hen", "zwaaien", "."], ["N", "V", "Pron", "Adj", "N", "Punc"]
+        ['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc']
     ):
         print(parse_graph)
 
@@ -751,23 +756,23 @@ def rule_based_demo():
     ndp = NonprojectiveDependencyParser(grammar)
     graphs = ndp.parse(
         [
-            "the",
-            "man",
-            "in",
-            "the",
-            "corner",
-            "taught",
-            "his",
-            "dachshund",
-            "to",
-            "play",
-            "golf",
+            'the',
+            'man',
+            'in',
+            'the',
+            'corner',
+            'taught',
+            'his',
+            'dachshund',
+            'to',
+            'play',
+            'golf',
         ]
     )
-    print("Graphs:")
+    print('Graphs:')
     for graph in graphs:
         print(graph)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
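
To make the feature extraction in NaiveBayesDependencyScorer concrete, here is a minimal sketch of the per-arc feature dictionaries it feeds to nltk.classify.NaiveBayesClassifier; the two training examples are invented and far too small to train a useful scorer.

from nltk.classify import NaiveBayesClassifier

# One dict per candidate arc: head word/tag (a, b) and child word/tag (c, d),
# labelled 'T' if the arc is present in the training graph, 'F' otherwise.
train = [
    (dict(a='zag', b='V', c='Cathy', d='N'), 'T'),
    (dict(a='zag', b='V', c='.', d='Punc'), 'F'),
]
classifier = NaiveBayesClassifier.train(train)
pdist = classifier.prob_classify(dict(a='zag', b='V', c='hen', d='Pron'))
print(pdist.prob('T'), pdist.prob('F'))  # probabilities used as edge scores
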
index 2b14eab..924d9a6 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Probabilistic Chart Parsers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
@@ -29,6 +29,7 @@ The ``BottomUpProbabilisticChartParser`` constructor has an optional
 argument beam_size.  If non-zero, this controls the size of the beam
 (aka the edge queue).  This option is most useful with InsideChartParser.
 """
+from __future__ import print_function, unicode_literals
 
 ##//////////////////////////////////////////////////////
 ##  Bottom-Up PCFG Chart Parser
@@ -44,6 +45,7 @@ from nltk.grammar import Nonterminal, PCFG
 
 from nltk.parse.api import ParserI
 from nltk.parse.chart import Chart, LeafEdge, TreeEdge, AbstractChartRule
+from nltk.compat import python_2_unicode_compatible
 
 # Probabilistic edges
 class ProbabilisticLeafEdge(LeafEdge):
@@ -128,6 +130,7 @@ class ProbabilisticFundamentalRule(AbstractChartRule):
             yield new_edge
 
 
+@python_2_unicode_compatible
 class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule):
     NUM_EDGES = 1
 
@@ -151,7 +154,7 @@ class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule):
                     yield new_edge
 
     def __str__(self):
-        return "Fundamental Rule"
+        return 'Fundamental Rule'
 
 
 class BottomUpProbabilisticChartParser(ParserI):
@@ -236,7 +239,7 @@ class BottomUpProbabilisticChartParser(ParserI):
         for edge in bu_init.apply(chart, grammar):
             if self._trace > 1:
                 print(
-                    "  %-50s [%s]"
+                    '  %-50s [%s]'
                     % (chart.pretty_format_edge(edge, width=2), edge.prob())
                 )
             queue.append(edge)
@@ -253,7 +256,7 @@ class BottomUpProbabilisticChartParser(ParserI):
             edge = queue.pop()
             if self._trace > 0:
                 print(
-                    "  %-50s [%s]"
+                    '  %-50s [%s]'
                     % (chart.pretty_format_edge(edge, width=2), edge.prob())
                 )
 
@@ -322,7 +325,7 @@ class BottomUpProbabilisticChartParser(ParserI):
             split = len(queue) - self.beam_size
             if self._trace > 2:
                 for edge in queue[:split]:
-                    print("  %-50s [DISCARDED]" % chart.pretty_format_edge(edge, 2))
+                    print('  %-50s [DISCARDED]' % chart.pretty_format_edge(edge, 2))
             del queue[:split]
 
 
@@ -380,7 +383,7 @@ class InsideChartParser(BottomUpProbabilisticChartParser):
 #                                      bestp.get(elt,0))
 #
 #         self._bestp = bestp
-#         for (k,v) in self._bestp.items(): print(k,v)
+#         for (k,v) in self._bestp.items(): print k,v
 #
 #     def _sortkey(self, edge):
 #         return edge.structure()[PROB] * self._bestp[edge.lhs()]
@@ -482,23 +485,23 @@ def demo(choice=None, draw_parses=None, print_parses=None):
     )
 
     demos = [
-        ("I saw John with my telescope", toy_pcfg1),
-        ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
+        ('I saw John with my telescope', toy_pcfg1),
+        ('the boy saw Jack with Bob under the table with a telescope', toy_pcfg2),
     ]
 
     if choice is None:
         # Ask the user which demo they want to use.
         print()
         for i in range(len(demos)):
-            print("%3s: %s" % (i + 1, demos[i][0]))
-            print("     %r" % demos[i][1])
+            print('%3s: %s' % (i + 1, demos[i][0]))
+            print('     %r' % demos[i][1])
             print()
-        print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
+        print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
         choice = int(sys.stdin.readline().strip()) - 1
     try:
         sent, grammar = demos[choice]
     except:
-        print("Bad sentence number")
+        print('Bad sentence number')
         return
 
     # Tokenize the sentence.
@@ -519,7 +522,7 @@ def demo(choice=None, draw_parses=None, print_parses=None):
     num_parses = []
     all_parses = {}
     for parser in parsers:
-        print("\ns: %s\nparser: %s\ngrammar: %s" % (sent, parser, grammar))
+        print('\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
         parser.trace(3)
         t = time.time()
         parses = list(parser.parse(tokens))
@@ -532,11 +535,11 @@ def demo(choice=None, draw_parses=None, print_parses=None):
 
     # Print some summary statistics
     print()
-    print("       Parser      Beam | Time (secs)   # Parses   Average P(parse)")
-    print("------------------------+------------------------------------------")
+    print('       Parser      Beam | Time (secs)   # Parses   Average P(parse)')
+    print('------------------------+------------------------------------------')
     for i in range(len(parsers)):
         print(
-            "%18s %4d |%11.4f%11d%19.14f"
+            '%18s %4d |%11.4f%11d%19.14f'
             % (
                 parsers[i].__class__.__name__,
                 parsers[i].beam_size,
@@ -550,29 +553,29 @@ def demo(choice=None, draw_parses=None, print_parses=None):
         p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
     else:
         p = 0
-    print("------------------------+------------------------------------------")
-    print("%18s      |%11s%11d%19.14f" % ("(All Parses)", "n/a", len(parses), p))
+    print('------------------------+------------------------------------------')
+    print('%18s      |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p))
 
     if draw_parses is None:
         # Ask the user if we should draw the parses.
         print()
-        print("Draw parses (y/n)? ", end=" ")
-        draw_parses = sys.stdin.readline().strip().lower().startswith("y")
+        print('Draw parses (y/n)? ', end=' ')
+        draw_parses = sys.stdin.readline().strip().lower().startswith('y')
     if draw_parses:
         from nltk.draw.tree import draw_trees
 
-        print("  please wait...")
+        print('  please wait...')
         draw_trees(*parses)
 
     if print_parses is None:
         # Ask the user if we should print the parses.
         print()
-        print("Print parses (y/n)? ", end=" ")
-        print_parses = sys.stdin.readline().strip().lower().startswith("y")
+        print('Print parses (y/n)? ', end=' ')
+        print_parses = sys.stdin.readline().strip().lower().startswith('y')
     if print_parses:
         for parse in parses:
             print(parse)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
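
A minimal usage sketch of the bottom-up PCFG chart parsers above, using toy_pcfg1 from nltk.grammar, the same grammar offered by demo(); beam_size=0 disables pruning, so all parses are found.

from nltk.grammar import toy_pcfg1
from nltk.parse.pchart import InsideChartParser

parser = InsideChartParser(toy_pcfg1, beam_size=0)
for tree in parser.parse('I saw John with my telescope'.split()):
    print(tree.prob(), tree)  # highest-probability parse comes first
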
index b4d56cf..d29ee8c 100644 (file)
@@ -1,11 +1,12 @@
 # Natural Language Toolkit: Dependency Grammars
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Jason Narad <jason.narad@gmail.com>
 #
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 #
+from __future__ import print_function, unicode_literals
 
 from collections import defaultdict
 from itertools import chain
@@ -18,7 +19,7 @@ from nltk.grammar import (
 )
 from nltk.parse.dependencygraph import DependencyGraph
 from nltk.internals import raise_unorderable_types
-
+from nltk.compat import python_2_unicode_compatible
 
 #################################################################
 # Dependency Span
@@ -26,6 +27,7 @@ from nltk.internals import raise_unorderable_types
 
 
 @total_ordering
+@python_2_unicode_compatible
 class DependencySpan(object):
     """
     A contiguous span over some part of the input string representing
@@ -59,7 +61,7 @@ class DependencySpan(object):
         :return: A concise string representation of the ``DependencySpan``.
         :rtype: str.
         """
-        return "Span %d-%d; Head Index: %d" % (
+        return 'Span %d-%d; Head Index: %d' % (
             self._start_index,
             self._end_index,
             self._head_index,
@@ -70,13 +72,13 @@ class DependencySpan(object):
         :return: A verbose string representation of the ``DependencySpan``.
         :rtype: str
         """
-        str = "Span %d-%d; Head Index: %d" % (
+        str = 'Span %d-%d; Head Index: %d' % (
             self._start_index,
             self._end_index,
             self._head_index,
         )
         for i in range(len(self._arcs)):
-            str += "\n%d <- %d, %s" % (i, self._arcs[i], self._tags[i])
+            str += '\n%d <- %d, %s' % (i, self._arcs[i], self._tags[i])
         return str
 
     def __eq__(self, other):
@@ -104,6 +106,7 @@ class DependencySpan(object):
 #################################################################
 
 
+@python_2_unicode_compatible
 class ChartCell(object):
     """
     A cell from the parse chart formed when performing the CYK algorithm.
@@ -137,14 +140,14 @@ class ChartCell(object):
         :return: A verbose string representation of this ``ChartCell``.
         :rtype: str.
         """
-        return "CC[%d,%d]: %s" % (self._x, self._y, self._entries)
+        return 'CC[%d,%d]: %s' % (self._x, self._y, self._entries)
 
     def __repr__(self):
         """
         :return: A concise string representation of this ``ChartCell``.
         :rtype: str.
         """
-        return "%s" % self
+        return '%s' % self
 
 
 #################################################################
@@ -192,7 +195,7 @@ class ProjectiveDependencyParser(object):
             for j in range(0, len(self._tokens) + 1):
                 chart[i].append(ChartCell(i, j))
                 if i == j + 1:
-                    chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], ["null"]))
+                    chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], ['null']))
 
         for i in range(1, len(self._tokens) + 1):
             for j in range(i - 2, -1, -1):
@@ -209,17 +212,17 @@ class ProjectiveDependencyParser(object):
                 #                malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null')
                 # conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'null', '-', '-')
                 # Modify to comply with the new Dependency Graph requirement (at least must have an root elements)
-                conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
+                conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (
                     i + 1,
                     tokens[i],
                     tokens[i],
-                    "null",
-                    "null",
-                    "null",
+                    'null',
+                    'null',
+                    'null',
                     parse._arcs[i] + 1,
-                    "ROOT",
-                    "-",
-                    "-",
+                    'ROOT',
+                    '-',
+                    '-',
                 )
             dg = DependencyGraph(conll_format)
             #           if self.meets_arity(dg):
@@ -239,7 +242,7 @@ class ProjectiveDependencyParser(object):
         """
         spans = []
         if span1._start_index == span2._start_index:
-            print("Error: Mismatched spans - replace this with thrown error")
+            print('Error: Mismatched spans - replace this with thrown error')
         if span1._start_index > span2._start_index:
             temp_span = span1
             span1 = span2
@@ -250,7 +253,7 @@ class ProjectiveDependencyParser(object):
         if self._grammar.contains(
             self._tokens[span1._head_index], self._tokens[span2._head_index]
         ):
-            #           print('Performing rightward cover %d to %d' % (span1._head_index, span2._head_index))
+            #           print 'Performing rightward cover %d to %d' % (span1._head_index, span2._head_index)
             new_arcs[span2._head_index - span1._start_index] = span1._head_index
             spans.append(
                 DependencySpan(
@@ -266,7 +269,7 @@ class ProjectiveDependencyParser(object):
         if self._grammar.contains(
             self._tokens[span2._head_index], self._tokens[span1._head_index]
         ):
-            #           print('performing leftward cover %d to %d' % (span2._head_index, span1._head_index))
+            #           print 'performing leftward cover %d to %d' % (span2._head_index, span1._head_index)
             new_arcs[span1._head_index - span1._start_index] = span2._head_index
             spans.append(
                 DependencySpan(
@@ -340,7 +343,7 @@ class ProbabilisticProjectiveDependencyParser(object):
                             )
                     else:
                         print(
-                            "No tag found for input token '%s', parse is impossible."
+                            'No tag found for input token \'%s\', parse is impossible.'
                             % tokens[i - 1]
                         )
                         return []
@@ -358,25 +361,25 @@ class ProbabilisticProjectiveDependencyParser(object):
             conll_format = ""
             malt_format = ""
             for i in range(len(tokens)):
-                malt_format += "%s\t%s\t%d\t%s\n" % (
+                malt_format += '%s\t%s\t%d\t%s\n' % (
                     tokens[i],
-                    "null",
+                    'null',
                     parse._arcs[i] + 1,
-                    "null",
+                    'null',
                 )
                 # conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'null', '-', '-')
                 # Modify to comply with recent change in dependency graph such that there must be a ROOT element.
-                conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
+                conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (
                     i + 1,
                     tokens[i],
                     tokens[i],
                     parse._tags[i],
                     parse._tags[i],
-                    "null",
+                    'null',
                     parse._arcs[i] + 1,
-                    "ROOT",
-                    "-",
-                    "-",
+                    'ROOT',
+                    '-',
+                    '-',
                 )
             dg = DependencyGraph(conll_format)
             score = self.compute_prob(dg)
@@ -398,7 +401,7 @@ class ProbabilisticProjectiveDependencyParser(object):
         """
         spans = []
         if span1._start_index == span2._start_index:
-            print("Error: Mismatched spans - replace this with thrown error")
+            print('Error: Mismatched spans - replace this with thrown error')
         if span1._start_index > span2._start_index:
             temp_span = span1
             span1 = span2
@@ -453,7 +456,7 @@ class ProbabilisticProjectiveDependencyParser(object):
         for dg in graphs:
             for node_index in range(1, len(dg.nodes)):
                 # children = dg.nodes[node_index]['deps']
-                children = list(chain(*dg.nodes[node_index]["deps"].values()))
+                children = list(chain(*dg.nodes[node_index]['deps'].values()))
 
                 nr_left_children = dg.left_children(node_index)
                 nr_right_children = dg.right_children(node_index)
@@ -461,34 +464,34 @@ class ProbabilisticProjectiveDependencyParser(object):
                 for child_index in range(
                     0 - (nr_left_children + 1), nr_right_children + 2
                 ):
-                    head_word = dg.nodes[node_index]["word"]
-                    head_tag = dg.nodes[node_index]["tag"]
+                    head_word = dg.nodes[node_index]['word']
+                    head_tag = dg.nodes[node_index]['tag']
                     if head_word in tags:
                         tags[head_word].add(head_tag)
                     else:
                         tags[head_word] = set([head_tag])
-                    child = "STOP"
-                    child_tag = "STOP"
-                    prev_word = "START"
-                    prev_tag = "START"
+                    child = 'STOP'
+                    child_tag = 'STOP'
+                    prev_word = 'START'
+                    prev_tag = 'START'
                     if child_index < 0:
                         array_index = child_index + nr_left_children
                         if array_index >= 0:
-                            child = dg.nodes[children[array_index]]["word"]
-                            child_tag = dg.nodes[children[array_index]]["tag"]
+                            child = dg.nodes[children[array_index]]['word']
+                            child_tag = dg.nodes[children[array_index]]['tag']
                         if child_index != -1:
-                            prev_word = dg.nodes[children[array_index + 1]]["word"]
-                            prev_tag = dg.nodes[children[array_index + 1]]["tag"]
-                        if child != "STOP":
+                            prev_word = dg.nodes[children[array_index + 1]]['word']
+                            prev_tag = dg.nodes[children[array_index + 1]]['tag']
+                        if child != 'STOP':
                             productions.append(DependencyProduction(head_word, [child]))
-                        head_event = "(head (%s %s) (mods (%s, %s, %s) left))" % (
+                        head_event = '(head (%s %s) (mods (%s, %s, %s) left))' % (
                             child,
                             child_tag,
                             prev_tag,
                             head_word,
                             head_tag,
                         )
-                        mod_event = "(mods (%s, %s, %s) left))" % (
+                        mod_event = '(mods (%s, %s, %s) left))' % (
                             prev_tag,
                             head_word,
                             head_tag,
@@ -498,21 +501,21 @@ class ProbabilisticProjectiveDependencyParser(object):
                     elif child_index > 0:
                         array_index = child_index + nr_left_children - 1
                         if array_index < nr_children:
-                            child = dg.nodes[children[array_index]]["word"]
-                            child_tag = dg.nodes[children[array_index]]["tag"]
+                            child = dg.nodes[children[array_index]]['word']
+                            child_tag = dg.nodes[children[array_index]]['tag']
                         if child_index != 1:
-                            prev_word = dg.nodes[children[array_index - 1]]["word"]
-                            prev_tag = dg.nodes[children[array_index - 1]]["tag"]
-                        if child != "STOP":
+                            prev_word = dg.nodes[children[array_index - 1]]['word']
+                            prev_tag = dg.nodes[children[array_index - 1]]['tag']
+                        if child != 'STOP':
                             productions.append(DependencyProduction(head_word, [child]))
-                        head_event = "(head (%s %s) (mods (%s, %s, %s) right))" % (
+                        head_event = '(head (%s %s) (mods (%s, %s, %s) right))' % (
                             child,
                             child_tag,
                             prev_tag,
                             head_word,
                             head_tag,
                         )
-                        mod_event = "(mods (%s, %s, %s) right))" % (
+                        mod_event = '(mods (%s, %s, %s) right))' % (
                             prev_tag,
                             head_word,
                             head_tag,
@@ -535,34 +538,34 @@ class ProbabilisticProjectiveDependencyParser(object):
         prob = 1.0
         for node_index in range(1, len(dg.nodes)):
             # children = dg.nodes[node_index]['deps']
-            children = list(chain(*dg.nodes[node_index]["deps"].values()))
+            children = list(chain(*dg.nodes[node_index]['deps'].values()))
 
             nr_left_children = dg.left_children(node_index)
             nr_right_children = dg.right_children(node_index)
             nr_children = nr_left_children + nr_right_children
             for child_index in range(0 - (nr_left_children + 1), nr_right_children + 2):
-                head_word = dg.nodes[node_index]["word"]
-                head_tag = dg.nodes[node_index]["tag"]
-                child = "STOP"
-                child_tag = "STOP"
-                prev_word = "START"
-                prev_tag = "START"
+                head_word = dg.nodes[node_index]['word']
+                head_tag = dg.nodes[node_index]['tag']
+                child = 'STOP'
+                child_tag = 'STOP'
+                prev_word = 'START'
+                prev_tag = 'START'
                 if child_index < 0:
                     array_index = child_index + nr_left_children
                     if array_index >= 0:
-                        child = dg.nodes[children[array_index]]["word"]
-                        child_tag = dg.nodes[children[array_index]]["tag"]
+                        child = dg.nodes[children[array_index]]['word']
+                        child_tag = dg.nodes[children[array_index]]['tag']
                     if child_index != -1:
-                        prev_word = dg.nodes[children[array_index + 1]]["word"]
-                        prev_tag = dg.nodes[children[array_index + 1]]["tag"]
-                    head_event = "(head (%s %s) (mods (%s, %s, %s) left))" % (
+                        prev_word = dg.nodes[children[array_index + 1]]['word']
+                        prev_tag = dg.nodes[children[array_index + 1]]['tag']
+                    head_event = '(head (%s %s) (mods (%s, %s, %s) left))' % (
                         child,
                         child_tag,
                         prev_tag,
                         head_word,
                         head_tag,
                     )
-                    mod_event = "(mods (%s, %s, %s) left))" % (
+                    mod_event = '(mods (%s, %s, %s) left))' % (
                         prev_tag,
                         head_word,
                         head_tag,
@@ -579,19 +582,19 @@ class ProbabilisticProjectiveDependencyParser(object):
                 elif child_index > 0:
                     array_index = child_index + nr_left_children - 1
                     if array_index < nr_children:
-                        child = dg.nodes[children[array_index]]["word"]
-                        child_tag = dg.nodes[children[array_index]]["tag"]
+                        child = dg.nodes[children[array_index]]['word']
+                        child_tag = dg.nodes[children[array_index]]['tag']
                     if child_index != 1:
-                        prev_word = dg.nodes[children[array_index - 1]]["word"]
-                        prev_tag = dg.nodes[children[array_index - 1]]["tag"]
-                    head_event = "(head (%s %s) (mods (%s, %s, %s) right))" % (
+                        prev_word = dg.nodes[children[array_index - 1]]['word']
+                        prev_tag = dg.nodes[children[array_index - 1]]['tag']
+                    head_event = '(head (%s %s) (mods (%s, %s, %s) right))' % (
                         child,
                         child_tag,
                         prev_tag,
                         head_word,
                         head_tag,
                     )
-                    mod_event = "(mods (%s, %s, %s) right))" % (
+                    mod_event = '(mods (%s, %s, %s) right))' % (
                         prev_tag,
                         head_word,
                         head_tag,
@@ -633,7 +636,7 @@ def projective_rule_parse_demo():
     )
     print(grammar)
     pdp = ProjectiveDependencyParser(grammar)
-    trees = pdp.parse(["the", "cats", "scratch", "the", "walls"])
+    trees = pdp.parse(['the', 'cats', 'scratch', 'the', 'walls'])
     for tree in trees:
         print(tree)
 
@@ -646,9 +649,9 @@ def arity_parse_demo():
     created by a ``ProjectiveDependencyParser``.
     """
     print()
-    print("A grammar with no arity constraints. Each DependencyProduction")
-    print("specifies a relationship between one head word and only one")
-    print("modifier word.")
+    print('A grammar with no arity constraints. Each DependencyProduction')
+    print('specifies a relationship between one head word and only one')
+    print('modifier word.')
     grammar = DependencyGrammar.fromstring(
         """
     'fell' -> 'price' | 'stock'
@@ -660,18 +663,18 @@ def arity_parse_demo():
     print(grammar)
 
     print()
-    print("For the sentence 'The price of the stock fell', this grammar")
-    print("will produce the following three parses:")
+    print('For the sentence \'The price of the stock fell\', this grammar')
+    print('will produce the following three parses:')
     pdp = ProjectiveDependencyParser(grammar)
-    trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"])
+    trees = pdp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])
     for tree in trees:
         print(tree)
 
     print()
-    print("By contrast, the following grammar contains a ")
-    print("DependencyProduction that specifies a relationship")
-    print("between a single head word, 'price', and two modifier")
-    print("words, 'of' and 'the'.")
+    print('By contrast, the following grammar contains a ')
+    print('DependencyProduction that specifies a relationship')
+    print('between a single head word, \'price\', and two modifier')
+    print('words, \'of\' and \'the\'.')
     grammar = DependencyGrammar.fromstring(
         """
     'fell' -> 'price' | 'stock'
@@ -684,10 +687,10 @@ def arity_parse_demo():
 
     print()
     print(
-        "This constrains the number of possible parses to just one:"
+        'This constrains the number of possible parses to just one:'
     )  # unimplemented, soon to replace
     pdp = ProjectiveDependencyParser(grammar)
-    trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"])
+    trees = pdp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])
     for tree in trees:
         print(tree)
 
@@ -699,17 +702,17 @@ def projective_prob_parse_demo():
     """
     from nltk.parse.dependencygraph import conll_data2
 
-    graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
+    graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry]
     ppdp = ProbabilisticProjectiveDependencyParser()
-    print("Training Probabilistic Projective Dependency Parser...")
+    print('Training Probabilistic Projective Dependency Parser...')
     ppdp.train(graphs)
 
-    sent = ["Cathy", "zag", "hen", "wild", "zwaaien", "."]
-    print("Parsing '", " ".join(sent), "'...")
-    print("Parse:")
+    sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.']
+    print('Parsing \'', " ".join(sent), '\'...')
+    print('Parse:')
     for tree in ppdp.parse(sent):
         print(tree)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
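
Both parsers above serialize their spans into 10-column CoNLL lines before handing them to DependencyGraph, and the comments note that the graph must contain a ROOT relation. A minimal sketch with an invented three-word sentence showing the shape DependencyGraph accepts (head index in column 7, relation label in column 8).

from nltk.parse.dependencygraph import DependencyGraph

conll = (
    "1\tthe\tthe\tnull\tnull\tnull\t2\tnull\t-\t-\n"
    "2\tcats\tcats\tnull\tnull\tnull\t3\tnull\t-\t-\n"
    "3\tscratch\tscratch\tnull\tnull\tnull\t0\tROOT\t-\t-\n"
)
dg = DependencyGraph(conll)
print(dg.tree())  # (scratch (cats the))
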
index 8496d4c..a9ab322 100644 (file)
@@ -1,13 +1,15 @@
 # Natural Language Toolkit: Recursive Descent Parser
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
 
 from nltk.grammar import Nonterminal
 from nltk.tree import Tree, ImmutableTree
+from nltk.compat import unicode_repr
 
 from nltk.parse.api import ParserI
 
@@ -278,17 +280,17 @@ class RecursiveDescentParser(ParserI):
         """
 
         if treeloc == ():
-            print("*", end=" ")
+            print("*", end=' ')
         if isinstance(tree, Tree):
             if len(tree) == 0:
-                print(repr(Nonterminal(tree.label())), end=" ")
+                print(unicode_repr(Nonterminal(tree.label())), end=' ')
             for i in range(len(tree)):
                 if treeloc is not None and i == treeloc[0]:
                     self._trace_fringe(tree[i], treeloc[1:])
                 else:
                     self._trace_fringe(tree[i])
         else:
-            print(repr(tree), end=" ")
+            print(unicode_repr(tree), end=' ')
 
     def _trace_tree(self, tree, frontier, operation):
         """
@@ -299,48 +301,48 @@ class RecursiveDescentParser(ParserI):
         :rtype: None
         """
         if self._trace == 2:
-            print("  %c [" % operation, end=" ")
+            print('  %c [' % operation, end=' ')
         else:
-            print("    [", end=" ")
+            print('    [', end=' ')
         if len(frontier) > 0:
             self._trace_fringe(tree, frontier[0])
         else:
             self._trace_fringe(tree)
-        print("]")
+        print(']')
 
     def _trace_start(self, tree, frontier, text):
-        print("Parsing %r" % " ".join(text))
+        print('Parsing %r' % " ".join(text))
         if self._trace > 2:
-            print("Start:")
+            print('Start:')
         if self._trace > 1:
-            self._trace_tree(tree, frontier, " ")
+            self._trace_tree(tree, frontier, ' ')
 
     def _trace_expand(self, tree, frontier, production):
         if self._trace > 2:
-            print("Expand: %s" % production)
+            print('Expand: %s' % production)
         if self._trace > 1:
-            self._trace_tree(tree, frontier, "E")
+            self._trace_tree(tree, frontier, 'E')
 
     def _trace_match(self, tree, frontier, tok):
         if self._trace > 2:
-            print("Match: %r" % tok)
+            print('Match: %r' % tok)
         if self._trace > 1:
-            self._trace_tree(tree, frontier, "M")
+            self._trace_tree(tree, frontier, 'M')
 
     def _trace_succeed(self, tree, frontier):
         if self._trace > 2:
-            print("GOOD PARSE:")
+            print('GOOD PARSE:')
         if self._trace == 1:
-            print("Found a parse:\n%s" % tree)
+            print('Found a parse:\n%s' % tree)
         if self._trace > 1:
-            self._trace_tree(tree, frontier, "+")
+            self._trace_tree(tree, frontier, '+')
 
     def _trace_backtrack(self, tree, frontier, toks=None):
         if self._trace > 2:
             if toks:
-                print("Backtrack: %r match failed" % toks[0])
+                print('Backtrack: %r match failed' % toks[0])
             else:
-                print("Backtrack")
+                print('Backtrack')
 
 
 ##//////////////////////////////////////////////////////
@@ -678,11 +680,11 @@ def demo():
     for prod in grammar.productions():
         print(prod)
 
-    sent = "I saw a man in the park".split()
+    sent = 'I saw a man in the park'.split()
     parser = parse.RecursiveDescentParser(grammar, trace=2)
     for p in parser.parse(sent):
         print(p)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
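
A minimal variant of the demo above with a smaller, invented grammar, so the behaviour of RecursiveDescentParser is easy to follow; being a top-down backtracking parser, it would loop forever on a left-recursive grammar, so none is used here.

from nltk import CFG
from nltk.parse import RecursiveDescentParser

grammar = CFG.fromstring("""
    S -> NP VP
    NP -> 'I' | Det N
    Det -> 'the'
    N -> 'dog'
    VP -> V NP
    V -> 'saw'
""")
parser = RecursiveDescentParser(grammar, trace=0)
for tree in parser.parse('I saw the dog'.split()):
    print(tree)  # (S (NP I) (VP (V saw) (NP (Det the) (N dog))))
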
index 5991465..a3514db 100644 (file)
@@ -1,13 +1,15 @@
 # Natural Language Toolkit: Shift-Reduce Parser
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
 
 from nltk.grammar import Nonterminal
 from nltk.tree import Tree
+from nltk.compat import unicode_repr
 
 from nltk.parse.api import ParserI
 
@@ -86,7 +88,7 @@ class ShiftReduceParser(ParserI):
 
         # Trace output.
         if self._trace:
-            print("Parsing %r" % " ".join(tokens))
+            print('Parsing %r' % " ".join(tokens))
             self._trace_stack(stack, remaining_text)
 
         # iterate through the text, pushing the token onto
@@ -214,7 +216,7 @@ class ShiftReduceParser(ParserI):
         # 3: display which tokens & productions are shifted/reduced
         self._trace = trace
 
-    def _trace_stack(self, stack, remaining_text, marker=" "):
+    def _trace_stack(self, stack, remaining_text, marker=' '):
         """
         Print trace output displaying the given stack and text.
 
@@ -223,13 +225,13 @@ class ShiftReduceParser(ParserI):
             stack.  This is used with trace level 2 to print 'S'
             before shifted stacks and 'R' before reduced stacks.
         """
-        s = "  " + marker + " [ "
+        s = '  ' + marker + ' [ '
         for elt in stack:
             if isinstance(elt, Tree):
-                s += repr(Nonterminal(elt.label())) + " "
+                s += unicode_repr(Nonterminal(elt.label())) + ' '
             else:
-                s += repr(elt) + " "
-        s += "* " + " ".join(remaining_text) + "]"
+                s += unicode_repr(elt) + ' '
+        s += '* ' + ' '.join(remaining_text) + ']'
         print(s)
 
     def _trace_shift(self, stack, remaining_text):
@@ -239,9 +241,9 @@ class ShiftReduceParser(ParserI):
         :rtype: None
         """
         if self._trace > 2:
-            print("Shift %r:" % stack[-1])
+            print('Shift %r:' % stack[-1])
         if self._trace == 2:
-            self._trace_stack(stack, remaining_text, "S")
+            self._trace_stack(stack, remaining_text, 'S')
         elif self._trace > 0:
             self._trace_stack(stack, remaining_text)
 
@@ -254,9 +256,9 @@ class ShiftReduceParser(ParserI):
         """
         if self._trace > 2:
             rhs = " ".join(production.rhs())
-            print("Reduce %r <- %s" % (production.lhs(), rhs))
+            print('Reduce %r <- %s' % (production.lhs(), rhs))
         if self._trace == 2:
-            self._trace_stack(stack, remaining_text, "R")
+            self._trace_stack(stack, remaining_text, 'R')
         elif self._trace > 1:
             self._trace_stack(stack, remaining_text)
 
@@ -277,7 +279,7 @@ class ShiftReduceParser(ParserI):
                 rhs1 = productions[i].rhs()
                 rhs2 = productions[j].rhs()
                 if rhs1[: len(rhs2)] == rhs2:
-                    print("Warning: %r will never be used" % productions[i])
+                    print('Warning: %r will never be used' % productions[i])
 
 
 ##//////////////////////////////////////////////////////
@@ -468,12 +470,12 @@ def demo():
     """
     )
 
-    sent = "I saw a man in the park".split()
+    sent = 'I saw a man in the park'.split()
 
     parser = parse.ShiftReduceParser(grammar, trace=2)
     for p in parser.parse(sent):
         print(p)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
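
The prefix check above flags a production whose right-hand side extends another production's right-hand side: the shorter production always matches the top of the stack first, so the longer one can never be reduced. A minimal sketch with a hypothetical toy grammar that should trigger both the warning and a successful parse:

    from nltk import CFG
    from nltk.parse import ShiftReduceParser

    # Hypothetical toy grammar; 'V NP' is a prefix of 'V NP PP', so the coverage
    # check should warn that 'VP -> V NP PP' will never be used.
    grammar = CFG.fromstring("""
        S  -> NP VP
        NP -> 'I' | Det N
        VP -> V NP | V NP PP
        PP -> P NP
        Det -> 'a' | 'the'
        N  -> 'man' | 'park'
        V  -> 'saw'
        P  -> 'in'
    """)

    parser = ShiftReduceParser(grammar, trace=2)  # trace=2 marks shifted stacks with 'S', reduced stacks with 'R'
    for tree in parser.parse('I saw a man'.split()):
        print(tree)
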
index 4350b35..8943df1 100644 (file)
@@ -1,18 +1,22 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Interface to the Stanford Parser
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Xu <xxu@student.unimelb.edu.au>
 #
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import unicode_literals
+
 import tempfile
 import os
 import warnings
 from unittest import skip
 from subprocess import PIPE
 
+from six import text_type
+
 from nltk.internals import (
     find_jar_iter,
     config_java,
@@ -25,15 +29,15 @@ from nltk.parse.api import ParserI
 from nltk.parse.dependencygraph import DependencyGraph
 from nltk.tree import Tree
 
-_stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml"
+_stanford_url = 'https://nlp.stanford.edu/software/lex-parser.shtml'
 
 
 class GenericStanfordParser(ParserI):
     """Interface to the Stanford Parser"""
 
-    _MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar"
-    _JAR = r"stanford-parser\.jar"
-    _MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"
+    _MODEL_JAR_PATTERN = r'stanford-parser-(\d+)(\.(\d+))+-models\.jar'
+    _JAR = r'stanford-parser\.jar'
+    _MAIN_CLASS = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
 
     _USE_STDIN = False
     _DOUBLE_SPACED_OUTPUT = False
@@ -42,11 +46,11 @@ class GenericStanfordParser(ParserI):
         self,
         path_to_jar=None,
         path_to_models_jar=None,
-        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
-        encoding="utf8",
+        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
+        encoding='utf8',
         verbose=False,
-        java_options="-mx4g",
-        corenlp_options="",
+        java_options='-mx4g',
+        corenlp_options='',
     ):
 
         # find the most recent code and model jar
@@ -54,7 +58,7 @@ class GenericStanfordParser(ParserI):
             find_jar_iter(
                 self._JAR,
                 path_to_jar,
-                env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"),
+                env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'),
                 searchpath=(),
                 url=_stanford_url,
                 verbose=verbose,
@@ -67,7 +71,7 @@ class GenericStanfordParser(ParserI):
             find_jar_iter(
                 self._MODEL_JAR_PATTERN,
                 path_to_models_jar,
-                env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"),
+                env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'),
                 searchpath=(),
                 url=_stanford_url,
                 verbose=verbose,
@@ -93,17 +97,17 @@ class GenericStanfordParser(ParserI):
         cur_trees = []
         blank = False
         for line in output_.splitlines(False):
-            if line == "":
+            if line == '':
                 if blank:
                     res.append(iter(cur_trees))
                     cur_trees = []
                     blank = False
                 elif self._DOUBLE_SPACED_OUTPUT:
-                    cur_trees.append(self._make_tree("\n".join(cur_lines)))
+                    cur_trees.append(self._make_tree('\n'.join(cur_lines)))
                     cur_lines = []
                     blank = True
                 else:
-                    res.append(iter([self._make_tree("\n".join(cur_lines))]))
+                    res.append(iter([self._make_tree('\n'.join(cur_lines))]))
                     cur_lines = []
             else:
                 cur_lines.append(line)
@@ -125,19 +129,19 @@ class GenericStanfordParser(ParserI):
         """
         cmd = [
             self._MAIN_CLASS,
-            "-model",
+            '-model',
             self.model_path,
-            "-sentences",
-            "newline",
-            "-outputFormat",
+            '-sentences',
+            'newline',
+            '-outputFormat',
             self._OUTPUT_FORMAT,
-            "-tokenized",
-            "-escaper",
-            "edu.stanford.nlp.process.PTBEscapingProcessor",
+            '-tokenized',
+            '-escaper',
+            'edu.stanford.nlp.process.PTBEscapingProcessor',
         ]
         return self._parse_trees_output(
             self._execute(
-                cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose
+                cmd, '\n'.join(' '.join(sentence) for sentence in sentences), verbose
             )
         )
 
@@ -165,15 +169,15 @@ class GenericStanfordParser(ParserI):
         """
         cmd = [
             self._MAIN_CLASS,
-            "-model",
+            '-model',
             self.model_path,
-            "-sentences",
-            "newline",
-            "-outputFormat",
+            '-sentences',
+            'newline',
+            '-outputFormat',
             self._OUTPUT_FORMAT,
         ]
         return self._parse_trees_output(
-            self._execute(cmd, "\n".join(sentences), verbose)
+            self._execute(cmd, '\n'.join(sentences), verbose)
         )
 
     def tagged_parse(self, sentence, verbose=False):
@@ -198,29 +202,29 @@ class GenericStanfordParser(ParserI):
         :type sentences: list(list(tuple(str, str)))
         :rtype: iter(iter(Tree))
         """
-        tag_separator = "/"
+        tag_separator = '/'
         cmd = [
             self._MAIN_CLASS,
-            "-model",
+            '-model',
             self.model_path,
-            "-sentences",
-            "newline",
-            "-outputFormat",
+            '-sentences',
+            'newline',
+            '-outputFormat',
             self._OUTPUT_FORMAT,
-            "-tokenized",
-            "-tagSeparator",
+            '-tokenized',
+            '-tagSeparator',
             tag_separator,
-            "-tokenizerFactory",
-            "edu.stanford.nlp.process.WhitespaceTokenizer",
-            "-tokenizerMethod",
-            "newCoreLabelTokenizerFactory",
+            '-tokenizerFactory',
+            'edu.stanford.nlp.process.WhitespaceTokenizer',
+            '-tokenizerMethod',
+            'newCoreLabelTokenizerFactory',
         ]
         # We don't need to escape slashes as "splitting is done on the last instance of the character in the token"
         return self._parse_trees_output(
             self._execute(
                 cmd,
-                "\n".join(
-                    " ".join(tag_separator.join(tagged) for tagged in sentence)
+                '\n'.join(
+                    ' '.join(tag_separator.join(tagged) for tagged in sentence)
                     for sentence in sentences
                 ),
                 verbose,
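
For reference, the payload that this method builds and hands to _execute() is just slash-joined word/tag pairs, one sentence per line. A quick sketch of the join shown above (the tags are illustrative):

    tag_separator = '/'
    sentences = [[('I', 'PRP'), ('saw', 'VBD'), ('a', 'DT'), ('man', 'NN')]]
    payload = '\n'.join(
        ' '.join(tag_separator.join(tagged) for tagged in sentence)
        for sentence in sentences
    )
    print(payload)  # -> I/PRP saw/VBD a/DT man/NN
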
@@ -229,19 +233,19 @@ class GenericStanfordParser(ParserI):
 
     def _execute(self, cmd, input_, verbose=False):
         encoding = self._encoding
-        cmd.extend(["-encoding", encoding])
+        cmd.extend(['-encoding', encoding])
         if self.corenlp_options:
             cmd.append(self.corenlp_options)
 
-        default_options = " ".join(_java_options)
+        default_options = ' '.join(_java_options)
 
         # Configure java.
         config_java(options=self.java_options, verbose=verbose)
 
         # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
-        with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
+        with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
             # Write the actual sentences to the temporary input file
-            if isinstance(input_, str) and encoding:
+            if isinstance(input_, text_type) and encoding:
                 input_ = input_.encode(encoding)
             input_file.write(input_)
             input_file.flush()
@@ -262,8 +266,8 @@ class GenericStanfordParser(ParserI):
                     cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
                 )
 
-            stdout = stdout.replace(b"\xc2\xa0", b" ")
-            stdout = stdout.replace(b"\x00\xa0", b" ")
+            stdout = stdout.replace(b'\xc2\xa0', b' ')
+            stdout = stdout.replace(b'\x00\xa0', b' ')
             stdout = stdout.decode(encoding)
 
         os.unlink(input_file.name)
@@ -325,7 +329,7 @@ class StanfordParser(GenericStanfordParser):
     [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
     """
 
-    _OUTPUT_FORMAT = "penn"
+    _OUTPUT_FORMAT = 'penn'
 
     def __init__(self, *args, **kwargs):
         warnings.warn(
@@ -391,7 +395,7 @@ class StanfordDependencyParser(GenericStanfordParser):
 
     """
 
-    _OUTPUT_FORMAT = "conll2007"
+    _OUTPUT_FORMAT = 'conll2007'
 
     def __init__(self, *args, **kwargs):
         warnings.warn(
@@ -404,11 +408,11 @@ class StanfordDependencyParser(GenericStanfordParser):
         super(StanfordDependencyParser, self).__init__(*args, **kwargs)
 
     def _make_tree(self, result):
-        return DependencyGraph(result, top_relation_label="root")
+        return DependencyGraph(result, top_relation_label='root')
 
 
 class StanfordNeuralDependencyParser(GenericStanfordParser):
-    """
+    '''
     >>> from nltk.parse.stanford import StanfordNeuralDependencyParser
     >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')
 
@@ -437,12 +441,12 @@ class StanfordNeuralDependencyParser(GenericStanfordParser):
     ... ))], []) # doctest: +NORMALIZE_WHITESPACE
     [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
     ['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
-    """
+    '''
 
-    _OUTPUT_FORMAT = "conll"
-    _MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
-    _JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar"
-    _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar"
+    _OUTPUT_FORMAT = 'conll'
+    _MAIN_CLASS = 'edu.stanford.nlp.pipeline.StanfordCoreNLP'
+    _JAR = r'stanford-corenlp-(\d+)(\.(\d+))+\.jar'
+    _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)(\.(\d+))+-models\.jar'
     _USE_STDIN = True
     _DOUBLE_SPACED_OUTPUT = True
 
@@ -455,22 +459,22 @@ class StanfordNeuralDependencyParser(GenericStanfordParser):
         )
 
         super(StanfordNeuralDependencyParser, self).__init__(*args, **kwargs)
-        self.corenlp_options += "-annotators tokenize,ssplit,pos,depparse"
+        self.corenlp_options += '-annotators tokenize,ssplit,pos,depparse'
 
     def tagged_parse_sents(self, sentences, verbose=False):
-        """
+        '''
         Currently unimplemented because the neural dependency parser (and
         the StanfordCoreNLP pipeline class) doesn't support passing in pre-
         tagged tokens.
-        """
+        '''
         raise NotImplementedError(
-            "tagged_parse[_sents] is not supported by "
-            "StanfordNeuralDependencyParser; use "
-            "parse[_sents] or raw_parse[_sents] instead."
+            'tagged_parse[_sents] is not supported by '
+            'StanfordNeuralDependencyParser; use '
+            'parse[_sents] or raw_parse[_sents] instead.'
         )
 
     def _make_tree(self, result):
-        return DependencyGraph(result, top_relation_label="ROOT")
+        return DependencyGraph(result, top_relation_label='ROOT')
 
 
 @skip("doctests from nltk.parse.stanford are skipped because it's deprecated")
@@ -479,10 +483,10 @@ def setup_module(module):
 
     try:
         StanfordParser(
-            model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
+            model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
         )
         StanfordNeuralDependencyParser()
     except LookupError:
         raise SkipTest(
-            "doctests from nltk.parse.stanford are skipped because one of the stanford parser or CoreNLP jars doesn't exist"
+            'doctests from nltk.parse.stanford are skipped because one of the stanford parser or CoreNLP jars doesn\'t exist'
         )
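
As the skip decorator above notes, nltk.parse.stanford is deprecated. The usual replacement is the CoreNLP client in nltk.parse.corenlp; a minimal sketch, assuming a CoreNLP server has already been started from the stanford-corenlp jars (the URL and port below are illustrative):

    from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser

    const_parser = CoreNLPParser(url='http://localhost:9000')
    print(next(const_parser.raw_parse('I saw a man in the park.')))

    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    graph = next(dep_parser.raw_parse('I saw a man in the park.'))
    print(graph.to_conll(4))  # word, tag, head, relation columns
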
index 6615288..a60bc37 100644 (file)
@@ -2,10 +2,13 @@
 #
 # Author: Long Duong <longdt219@gmail.com>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
 import tempfile
 import pickle
 
@@ -52,11 +55,11 @@ class Configuration(object):
 
     def __str__(self):
         return (
-            "Stack : "
+            'Stack : '
             + str(self.stack)
-            + "  Buffer : "
+            + '  Buffer : '
             + str(self.buffer)
-            + "   Arcs : "
+            + '   Arcs : '
             + str(self.arcs)
         )
 
@@ -67,10 +70,10 @@ class Configuration(object):
         """
         if feat is None:
             return False
-        if feat == "":
+        if feat == '':
             return False
         if flag is False:
-            if feat == "_":
+            if feat == '_':
                 return False
         return True
 
@@ -88,28 +91,28 @@ class Configuration(object):
             # Stack 0
             stack_idx0 = self.stack[len(self.stack) - 1]
             token = self._tokens[stack_idx0]
-            if self._check_informative(token["word"], True):
-                result.append("STK_0_FORM_" + token["word"])
-            if "lemma" in token and self._check_informative(token["lemma"]):
-                result.append("STK_0_LEMMA_" + token["lemma"])
-            if self._check_informative(token["tag"]):
-                result.append("STK_0_POS_" + token["tag"])
-            if "feats" in token and self._check_informative(token["feats"]):
-                feats = token["feats"].split("|")
+            if self._check_informative(token['word'], True):
+                result.append('STK_0_FORM_' + token['word'])
+            if 'lemma' in token and self._check_informative(token['lemma']):
+                result.append('STK_0_LEMMA_' + token['lemma'])
+            if self._check_informative(token['tag']):
+                result.append('STK_0_POS_' + token['tag'])
+            if 'feats' in token and self._check_informative(token['feats']):
+                feats = token['feats'].split("|")
                 for feat in feats:
-                    result.append("STK_0_FEATS_" + feat)
+                    result.append('STK_0_FEATS_' + feat)
             # Stack 1
             if len(self.stack) > 1:
                 stack_idx1 = self.stack[len(self.stack) - 2]
                 token = self._tokens[stack_idx1]
-                if self._check_informative(token["tag"]):
-                    result.append("STK_1_POS_" + token["tag"])
+                if self._check_informative(token['tag']):
+                    result.append('STK_1_POS_' + token['tag'])
 
             # Left most, right most dependency of stack[0]
             left_most = 1000000
             right_most = -1
-            dep_left_most = ""
-            dep_right_most = ""
+            dep_left_most = ''
+            dep_right_most = ''
             for (wi, r, wj) in self.arcs:
                 if wi == stack_idx0:
                     if (wj > wi) and (wj > right_most):
@@ -119,48 +122,48 @@ class Configuration(object):
                         left_most = wj
                         dep_left_most = r
             if self._check_informative(dep_left_most):
-                result.append("STK_0_LDEP_" + dep_left_most)
+                result.append('STK_0_LDEP_' + dep_left_most)
             if self._check_informative(dep_right_most):
-                result.append("STK_0_RDEP_" + dep_right_most)
+                result.append('STK_0_RDEP_' + dep_right_most)
 
         # Check Buffered 0
         if len(self.buffer) > 0:
             # Buffer 0
             buffer_idx0 = self.buffer[0]
             token = self._tokens[buffer_idx0]
-            if self._check_informative(token["word"], True):
-                result.append("BUF_0_FORM_" + token["word"])
-            if "lemma" in token and self._check_informative(token["lemma"]):
-                result.append("BUF_0_LEMMA_" + token["lemma"])
-            if self._check_informative(token["tag"]):
-                result.append("BUF_0_POS_" + token["tag"])
-            if "feats" in token and self._check_informative(token["feats"]):
-                feats = token["feats"].split("|")
+            if self._check_informative(token['word'], True):
+                result.append('BUF_0_FORM_' + token['word'])
+            if 'lemma' in token and self._check_informative(token['lemma']):
+                result.append('BUF_0_LEMMA_' + token['lemma'])
+            if self._check_informative(token['tag']):
+                result.append('BUF_0_POS_' + token['tag'])
+            if 'feats' in token and self._check_informative(token['feats']):
+                feats = token['feats'].split("|")
                 for feat in feats:
-                    result.append("BUF_0_FEATS_" + feat)
+                    result.append('BUF_0_FEATS_' + feat)
             # Buffer 1
             if len(self.buffer) > 1:
                 buffer_idx1 = self.buffer[1]
                 token = self._tokens[buffer_idx1]
-                if self._check_informative(token["word"], True):
-                    result.append("BUF_1_FORM_" + token["word"])
-                if self._check_informative(token["tag"]):
-                    result.append("BUF_1_POS_" + token["tag"])
+                if self._check_informative(token['word'], True):
+                    result.append('BUF_1_FORM_' + token['word'])
+                if self._check_informative(token['tag']):
+                    result.append('BUF_1_POS_' + token['tag'])
             if len(self.buffer) > 2:
                 buffer_idx2 = self.buffer[2]
                 token = self._tokens[buffer_idx2]
-                if self._check_informative(token["tag"]):
-                    result.append("BUF_2_POS_" + token["tag"])
+                if self._check_informative(token['tag']):
+                    result.append('BUF_2_POS_' + token['tag'])
             if len(self.buffer) > 3:
                 buffer_idx3 = self.buffer[3]
                 token = self._tokens[buffer_idx3]
-                if self._check_informative(token["tag"]):
-                    result.append("BUF_3_POS_" + token["tag"])
+                if self._check_informative(token['tag']):
+                    result.append('BUF_3_POS_' + token['tag'])
                     # Left most, right most dependency of buffer[0]
             left_most = 1000000
             right_most = -1
-            dep_left_most = ""
-            dep_right_most = ""
+            dep_left_most = ''
+            dep_right_most = ''
             for (wi, r, wj) in self.arcs:
                 if wi == buffer_idx0:
                     if (wj > wi) and (wj > right_most):
@@ -170,9 +173,9 @@ class Configuration(object):
                         left_most = wj
                         dep_left_most = r
             if self._check_informative(dep_left_most):
-                result.append("BUF_0_LDEP_" + dep_left_most)
+                result.append('BUF_0_LDEP_' + dep_left_most)
             if self._check_informative(dep_right_most):
-                result.append("BUF_0_RDEP_" + dep_right_most)
+                result.append('BUF_0_RDEP_' + dep_right_most)
 
         return result
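
extract_features() flattens the current parser configuration into sparse string features; the list it returns looks roughly like the one below (the word, tag and relation values are invented for illustration). Each string is later mapped to an integer id and emitted as 'id:1.0' pairs for the SVM (see the hunk further down).

    # Illustrative only: actual features depend on the tokens and arcs in the Configuration.
    features = [
        'STK_0_FORM_saw', 'STK_0_LEMMA_see', 'STK_0_POS_VBD', 'STK_1_POS_PRP',
        'BUF_0_FORM_man', 'BUF_0_POS_NN', 'BUF_1_FORM_in', 'BUF_1_POS_IN',
        'STK_0_LDEP_nsubj', 'BUF_0_RDEP_case',
    ]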
 
@@ -184,10 +187,10 @@ class Transition(object):
     """
 
     # Define set of transitions
-    LEFT_ARC = "LEFTARC"
-    RIGHT_ARC = "RIGHTARC"
-    SHIFT = "SHIFT"
-    REDUCE = "REDUCE"
+    LEFT_ARC = 'LEFTARC'
+    RIGHT_ARC = 'RIGHTARC'
+    SHIFT = 'SHIFT'
+    REDUCE = 'REDUCE'
 
     def __init__(self, alg_option):
         """
@@ -290,8 +293,8 @@ class TransitionParser(ParserI):
     Class for a transition-based parser. Implements two algorithms: "arc-standard" and "arc-eager"
     """
 
-    ARC_STANDARD = "arc-standard"
-    ARC_EAGER = "arc-eager"
+    ARC_STANDARD = 'arc-standard'
+    ARC_EAGER = 'arc-eager'
 
     def __init__(self, algorithm):
         """
@@ -313,11 +316,11 @@ class TransitionParser(ParserI):
         p_node = depgraph.nodes[idx_parent]
         c_node = depgraph.nodes[idx_child]
 
-        if c_node["word"] is None:
+        if c_node['word'] is None:
             return None  # Root word
 
-        if c_node["head"] == p_node["address"]:
-            return c_node["rel"]
+        if c_node['head'] == p_node['address']:
+            return c_node['rel']
         else:
             return None
 
@@ -333,8 +336,8 @@ class TransitionParser(ParserI):
             unsorted_result.append(self._dictionary[feature])
 
         # Default value of each feature is 1.0
-        return " ".join(
-            str(featureID) + ":1.0" for featureID in sorted(unsorted_result)
+        return ' '.join(
+            str(featureID) + ':1.0' for featureID in sorted(unsorted_result)
         )
 
     def _is_projective(self, depgraph):
@@ -342,9 +345,9 @@ class TransitionParser(ParserI):
         for key in depgraph.nodes:
             node = depgraph.nodes[key]
 
-            if "head" in node:
-                childIdx = node["address"]
-                parentIdx = node["head"]
+            if 'head' in node:
+                childIdx = node['address']
+                parentIdx = node['head']
                 if parentIdx is not None:
                     arc_list.append((parentIdx, childIdx))
 
@@ -370,8 +373,8 @@ class TransitionParser(ParserI):
         self._transition.setdefault(key, len(self._transition) + 1)
         self._match_transition[self._transition[key]] = key
 
-        input_str = str(self._transition[key]) + " " + binary_features + "\n"
-        input_file.write(input_str.encode("utf-8"))
+        input_str = str(self._transition[key]) + ' ' + binary_features + '\n'
+        input_file.write(input_str.encode('utf-8'))
 
     def _create_training_examples_arc_std(self, depgraphs, input_file):
         """
@@ -398,7 +401,7 @@ class TransitionParser(ParserI):
                     # Left-arc operation
                     rel = self._get_dep_relation(b0, s0, depgraph)
                     if rel is not None:
-                        key = Transition.LEFT_ARC + ":" + rel
+                        key = Transition.LEFT_ARC + ':' + rel
                         self._write_to_file(key, binary_features, input_file)
                         operation.left_arc(conf, rel)
                         training_seq.append(key)
@@ -419,7 +422,7 @@ class TransitionParser(ParserI):
                                         precondition = False
 
                         if precondition:
-                            key = Transition.RIGHT_ARC + ":" + rel
+                            key = Transition.RIGHT_ARC + ':' + rel
                             self._write_to_file(key, binary_features, input_file)
                             operation.right_arc(conf, rel)
                             training_seq.append(key)
@@ -460,7 +463,7 @@ class TransitionParser(ParserI):
                     # Left-arc operation
                     rel = self._get_dep_relation(b0, s0, depgraph)
                     if rel is not None:
-                        key = Transition.LEFT_ARC + ":" + rel
+                        key = Transition.LEFT_ARC + ':' + rel
                         self._write_to_file(key, binary_features, input_file)
                         operation.left_arc(conf, rel)
                         training_seq.append(key)
@@ -469,7 +472,7 @@ class TransitionParser(ParserI):
                     # Right-arc operation
                     rel = self._get_dep_relation(s0, b0, depgraph)
                     if rel is not None:
-                        key = Transition.RIGHT_ARC + ":" + rel
+                        key = Transition.RIGHT_ARC + ':' + rel
                         self._write_to_file(key, binary_features, input_file)
                         operation.right_arc(conf, rel)
                         training_seq.append(key)
@@ -509,7 +512,7 @@ class TransitionParser(ParserI):
 
         try:
             input_file = tempfile.NamedTemporaryFile(
-                prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False
+                prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False
             )
 
             if self._algorithm == self.ARC_STANDARD:
@@ -525,7 +528,7 @@ class TransitionParser(ParserI):
             # TODO: probability=True makes training very slow because of the
             # internal cross-validation. Need to improve the speed here.
             model = svm.SVC(
-                kernel="poly",
+                kernel='poly',
                 degree=2,
                 coef0=0,
                 gamma=0.2,
@@ -536,7 +539,7 @@ class TransitionParser(ParserI):
 
             model.fit(x_train, y_train)
             # Save the model to file name (as pickle)
-            pickle.dump(model, open(modelfile, "wb"))
+            pickle.dump(model, open(modelfile, 'wb'))
         finally:
             remove(input_file.name)
 
@@ -550,7 +553,7 @@ class TransitionParser(ParserI):
         """
         result = []
         # First load the model
-        model = pickle.load(open(modelFile, "rb"))
+        model = pickle.load(open(modelFile, 'rb'))
         operation = Transition(self._algorithm)
 
         for depgraph in depgraphs:
@@ -635,13 +638,13 @@ class TransitionParser(ParserI):
             new_depgraph = deepcopy(depgraph)
             for key in new_depgraph.nodes:
                 node = new_depgraph.nodes[key]
-                node["rel"] = ""
+                node['rel'] = ''
                 # With the default, all the tokens depend on the Root
-                node["head"] = 0
+                node['head'] = 0
             for (head, rel, child) in conf.arcs:
                 c_node = new_depgraph.nodes[child]
-                c_node["head"] = head
-                c_node["rel"] = rel
+                c_node['head'] = head
+                c_node['rel'] = rel
             result.append(new_depgraph)
 
         return result
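
A minimal end-to-end sketch of this class, assuming scikit-learn is installed; the one-sentence CoNLL fragment is purely illustrative (real training would use a full treebank), and the model file name is arbitrary:

    from nltk.parse.dependencygraph import DependencyGraph
    from nltk.parse.transitionparser import TransitionParser

    # One projective sentence in 10-column CoNLL format (tabs between fields).
    conll = "\n".join([
        "1\tI\t_\tPRP\tPRP\t_\t2\tnsubj\t_\t_",
        "2\tsaw\t_\tVBD\tVBD\t_\t0\tROOT\t_\t_",
        "3\tit\t_\tPRP\tPRP\t_\t2\tdobj\t_\t_",
    ])
    gold = [DependencyGraph(conll)]

    tp = TransitionParser('arc-standard')        # or 'arc-eager'
    tp.train(gold, 'transition_parse.model')     # pickles an sklearn SVC into the working directory
    parsed = tp.parse(gold, 'transition_parse.model')
    print(parsed[0].to_conll(10))
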
index 34630a0..6ebe146 100644 (file)
@@ -2,7 +2,7 @@
 #
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
@@ -10,6 +10,7 @@
 """
 Utility functions for parsers.
 """
+from __future__ import print_function
 
 from nltk.grammar import CFG, FeatureGrammar, PCFG
 from nltk.data import load
@@ -97,7 +98,7 @@ def taggedsent_to_conll(sentence):
     :return: a generator yielding a single sentence in CONLL format.
     """
     for (i, (word, tag)) in enumerate(sentence, start=1):
-        input_str = [str(i), word, "_", tag, tag, "_", "0", "a", "_", "_"]
+        input_str = [str(i), word, '_', tag, tag, '_', '0', 'a', '_', '_']
         input_str = "\t".join(input_str) + "\n"
         yield input_str
 
@@ -138,7 +139,7 @@ def taggedsents_to_conll(sentences):
     for sentence in sentences:
         for input_str in taggedsent_to_conll(sentence):
             yield input_str
-        yield "\n\n"
+        yield '\n\n'
 
 
 ######################################################################
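
taggedsent_to_conll() above emits one 10-column CoNLL line per token, with a dummy head of 0 and a dummy relation 'a'. A quick sketch of its output for a short tagged sentence:

    from nltk.parse.util import taggedsent_to_conll

    for line in taggedsent_to_conll([('This', 'DT'), ('works', 'VBZ')]):
        print(line, end='')
    # 1  This   _  DT   DT   _  0  a  _  _   (tab-separated)
    # 2  works  _  VBZ  VBZ  _  0  a  _  _
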
@@ -169,8 +170,8 @@ class TestGrammar(object):
         according to the grammar, then the value of ``trees`` will be None.
         """
         for test in self.suite:
-            print(test["doc"] + ":", end=" ")
-            for key in ["accept", "reject"]:
+            print(test['doc'] + ":", end=' ')
+            for key in ['accept', 'reject']:
                 for sent in test[key]:
                     tokens = sent.split()
                     trees = list(self.cp.parse(tokens))
@@ -179,7 +180,7 @@ class TestGrammar(object):
                         print(sent)
                         for tree in trees:
                             print(tree)
-                    if key == "accept":
+                    if key == 'accept':
                         if trees == []:
                             raise ValueError("Sentence '%s' failed to parse" % sent)
                         else:
@@ -212,14 +213,14 @@ def extract_test_sentences(string, comment_chars="#%;", encoding=None):
     if encoding is not None:
         string = string.decode(encoding)
     sentences = []
-    for sentence in string.split("\n"):
-        if sentence == "" or sentence[0] in comment_chars:
+    for sentence in string.split('\n'):
+        if sentence == '' or sentence[0] in comment_chars:
             continue
-        split_info = sentence.split(":", 1)
+        split_info = sentence.split(':', 1)
         result = None
         if len(split_info) == 2:
-            if split_info[0] in ["True", "true", "False", "false"]:
-                result = split_info[0] in ["True", "true"]
+            if split_info[0] in ['True', 'true', 'False', 'false']:
+                result = split_info[0] in ['True', 'true']
                 sentence = split_info[1]
             else:
                 result = int(split_info[0])
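
The prefix handling above is easiest to see with a concrete test-suite string (the sentences below are made up); a line may start with a comment character, a True/False grammaticality judgement, or an expected parse count:

    from nltk.parse.util import extract_test_sentences

    suite = "\n".join([
        "# lines starting with '#', '%' or ';' are skipped",
        "True: the dog barks",
        "False: barks dog the",
        "2: I saw the man with the telescope",
        "the dog barks",   # no prefix, so no expected result is recorded
    ])
    print(extract_test_sentences(suite))  # (token list, expected result) pairs
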
index bcb9687..7f6217e 100644 (file)
@@ -1,13 +1,15 @@
 # Natural Language Toolkit: Viterbi Probabilistic Parser
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
 
 from functools import reduce
 from nltk.tree import Tree, ProbabilisticTree
+from nltk.compat import python_2_unicode_compatible
 
 from nltk.parse.api import ParserI
 
@@ -16,6 +18,7 @@ from nltk.parse.api import ParserI
 ##//////////////////////////////////////////////////////
 
 
+@python_2_unicode_compatible
 class ViterbiParser(ParserI):
     """
     A bottom-up ``PCFG`` parser that uses dynamic programming to find
@@ -121,7 +124,7 @@ class ViterbiParser(ParserI):
         # Initialize the constituents dictionary with the words from
         # the text.
         if self._trace:
-            print(("Inserting tokens into the most likely" + " constituents table..."))
+            print(('Inserting tokens into the most likely' + ' constituents table...'))
         for index in range(len(tokens)):
             token = tokens[index]
             constituents[index, index + 1, token] = token
@@ -134,8 +137,8 @@ class ViterbiParser(ParserI):
             if self._trace:
                 print(
                     (
-                        "Finding the most likely constituents"
-                        + " spanning %d text elements..." % length
+                        'Finding the most likely constituents'
+                        + ' spanning %d text elements...' % length
                     )
                 )
             for start in range(len(tokens) - length + 1):
@@ -207,9 +210,9 @@ class ViterbiParser(ParserI):
                 if self._trace > 1:
                     if c is None or c != tree:
                         if c is None or c.prob() < tree.prob():
-                            print("   Insert:", end=" ")
+                            print('   Insert:', end=' ')
                         else:
-                            print("  Discard:", end=" ")
+                            print('  Discard:', end=' ')
                         self._trace_production(production, p, span, len(tokens))
                 if c is None or c.prob() < tree.prob():
                     constituents[span[0], span[1], production.lhs()] = tree
@@ -305,22 +308,22 @@ class ViterbiParser(ParserI):
         :rtype: None
         """
 
-        str = "|" + "." * span[0]
-        str += "=" * (span[1] - span[0])
-        str += "." * (width - span[1]) + "| "
-        str += "%s" % production
+        str = '|' + '.' * span[0]
+        str += '=' * (span[1] - span[0])
+        str += '.' * (width - span[1]) + '| '
+        str += '%s' % production
         if self._trace > 2:
-            str = "%-40s %12.10f " % (str, p)
+            str = '%-40s %12.10f ' % (str, p)
 
         print(str)
 
     def _trace_lexical_insertion(self, token, index, width):
-        str = "   Insert: |" + "." * index + "=" + "." * (width - index - 1) + "| "
-        str += "%s" % (token,)
+        str = '   Insert: |' + '.' * index + '=' + '.' * (width - index - 1) + '| '
+        str += '%s' % (token,)
         print(str)
 
     def __repr__(self):
-        return "<ViterbiParser for %r>" % self._grammar
+        return '<ViterbiParser for %r>' % self._grammar
 
 
 ##//////////////////////////////////////////////////////
@@ -342,22 +345,22 @@ def demo():
 
     # Define two demos.  Each demo has a sentence and a grammar.
     demos = [
-        ("I saw the man with my telescope", toy_pcfg1),
-        ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
+        ('I saw the man with my telescope', toy_pcfg1),
+        ('the boy saw Jack with Bob under the table with a telescope', toy_pcfg2),
     ]
 
     # Ask the user which demo they want to use.
     print()
     for i in range(len(demos)):
-        print("%3s: %s" % (i + 1, demos[i][0]))
-        print("     %r" % demos[i][1])
+        print('%3s: %s' % (i + 1, demos[i][0]))
+        print('     %r' % demos[i][1])
         print()
-    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
+    print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
     try:
         snum = int(sys.stdin.readline().strip()) - 1
         sent, grammar = demos[snum]
     except:
-        print("Bad sentence number")
+        print('Bad sentence number')
         return
 
     # Tokenize the sentence.
@@ -366,7 +369,7 @@ def demo():
     parser = ViterbiParser(grammar)
     all_parses = {}
 
-    print("\nsent: %s\nparser: %s\ngrammar: %s" % (sent, parser, grammar))
+    print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
     parser.trace(3)
     t = time.time()
     parses = parser.parse_all(tokens)
@@ -380,33 +383,33 @@ def demo():
 
     # Print some summary statistics
     print()
-    print("Time (secs)   # Parses   Average P(parse)")
-    print("-----------------------------------------")
-    print("%11.4f%11d%19.14f" % (time, num_parses, average))
+    print('Time (secs)   # Parses   Average P(parse)')
+    print('-----------------------------------------')
+    print('%11.4f%11d%19.14f' % (time, num_parses, average))
     parses = all_parses.keys()
     if parses:
         p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
     else:
         p = 0
-    print("------------------------------------------")
-    print("%11s%11d%19.14f" % ("n/a", len(parses), p))
+    print('------------------------------------------')
+    print('%11s%11d%19.14f' % ('n/a', len(parses), p))
 
     # Ask the user if we should draw the parses.
     print()
-    print("Draw parses (y/n)? ", end=" ")
-    if sys.stdin.readline().strip().lower().startswith("y"):
+    print('Draw parses (y/n)? ', end=' ')
+    if sys.stdin.readline().strip().lower().startswith('y'):
         from nltk.draw.tree import draw_trees
 
-        print("  please wait...")
+        print('  please wait...')
         draw_trees(*parses)
 
     # Ask the user if we should print the parses.
     print()
-    print("Print parses (y/n)? ", end=" ")
-    if sys.stdin.readline().strip().lower().startswith("y"):
+    print('Print parses (y/n)? ', end=' ')
+    if sys.stdin.readline().strip().lower().startswith('y'):
         for parse in parses:
             print(parse)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
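
The demo above is interactive; a non-interactive sketch of the same flow, using one of the toy grammars it offers (toy_pcfg1 is imported from nltk.grammar in the full module):

    from nltk.grammar import toy_pcfg1
    from nltk.parse import ViterbiParser

    parser = ViterbiParser(toy_pcfg1)
    parser.trace(3)  # show each constituent insertion with its probability
    for tree in parser.parse_all('I saw the man with my telescope'.split()):
        print(tree.prob(), tree)
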
index 5a59c3f..a83af71 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Probability and Statistics
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com> (additions)
 #         Trevor Cohn <tacohn@cs.mu.oz.au> (additions)
@@ -37,6 +37,7 @@ implementation of the ``ConditionalProbDistI`` interface is
 ``ConditionalProbDist``, a derived distribution.
 
 """
+from __future__ import print_function, unicode_literals, division
 
 import math
 import random
@@ -46,16 +47,19 @@ from collections import defaultdict, Counter
 from functools import reduce
 from abc import ABCMeta, abstractmethod
 
+from six import itervalues, text_type, add_metaclass
+
+from nltk import compat
 from nltk.internals import raise_unorderable_types
 
-_NINF = float("-1e300")
+_NINF = float('-1e300')
 
 ##//////////////////////////////////////////////////////
 ##  Frequency Distributions
 ##//////////////////////////////////////////////////////
 
 
-
+@compat.python_2_unicode_compatible
 class FreqDist(Counter):
     """
     A frequency distribution for the outcomes of an experiment.  A
@@ -242,7 +246,7 @@ class FreqDist(Counter):
         """
         if len(self) == 0:
             raise ValueError(
-                "A FreqDist must have at least one sample before max is defined."
+                'A FreqDist must have at least one sample before max is defined.'
             )
         return self.most_common(1)[0][0]
 
@@ -260,19 +264,19 @@ class FreqDist(Counter):
         :type title: bool
         """
         try:
-            import matplotlib.pyplot as plt
+            from matplotlib import pylab
         except ImportError:
             raise ValueError(
-                "The plot function requires matplotlib to be installed."
-                "See http://matplotlib.org/"
+                'The plot function requires matplotlib to be installed.'
+                'See http://matplotlib.org/'
             )
 
         if len(args) == 0:
             args = [len(self)]
         samples = [item for item, _ in self.most_common(*args)]
 
-        cumulative = _get_kwarg(kwargs, "cumulative", False)
-        percents = _get_kwarg(kwargs, "percents", False)
+        cumulative = _get_kwarg(kwargs, 'cumulative', False)
+        percents = _get_kwarg(kwargs, 'percents', False)
         if cumulative:
             freqs = list(self._cumulative_frequencies(samples))
             ylabel = "Cumulative Counts"
@@ -284,24 +288,17 @@ class FreqDist(Counter):
             ylabel = "Counts"
         # percents = [f * 100 for f in freqs]  only in ProbDist?
 
-        ax = plt.gca()
-        ax.grid(True, color="silver")
-
+        pylab.grid(True, color="silver")
         if "linewidth" not in kwargs:
             kwargs["linewidth"] = 2
         if "title" in kwargs:
-            ax.set_title(kwargs["title"])
+            pylab.title(kwargs["title"])
             del kwargs["title"]
-
-        ax.plot(freqs, **kwargs)
-        ax.set_xticks(range(len(samples)))
-        ax.set_xticklabels([str(s) for s in samples], rotation=90)
-        ax.set_xlabel("Samples")
-        ax.set_ylabel(ylabel)
-
-        plt.show()
-
-        return ax
+        pylab.plot(freqs, **kwargs)
+        pylab.xticks(range(len(samples)), [text_type(s) for s in samples], rotation=90)
+        pylab.xlabel("Samples")
+        pylab.ylabel(ylabel)
+        pylab.show()
 
     def tabulate(self, *args, **kwargs):
         """
@@ -319,21 +316,21 @@ class FreqDist(Counter):
             args = [len(self)]
         samples = [item for item, _ in self.most_common(*args)]
 
-        cumulative = _get_kwarg(kwargs, "cumulative", False)
+        cumulative = _get_kwarg(kwargs, 'cumulative', False)
         if cumulative:
             freqs = list(self._cumulative_frequencies(samples))
         else:
             freqs = [self[sample] for sample in samples]
         # percents = [f * 100 for f in freqs]  only in ProbDist?
 
-        width = max(len("{}".format(s)) for s in samples)
+        width = max(len("%s" % s) for s in samples)
         width = max(width, max(len("%d" % f) for f in freqs))
 
         for i in range(len(samples)):
-            print("%*s" % (width, samples[i]), end=" ")
+            print("%*s" % (width, samples[i]), end=' ')
         print()
         for i in range(len(samples)):
-            print("%*d" % (width, freqs[i]), end=" ")
+            print("%*d" % (width, freqs[i]), end=' ')
         print()
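
For quick reference, FreqDist.tabulate() and FreqDist.plot() shown above are used like this (plot() needs matplotlib; the sample string is arbitrary):

    from nltk.probability import FreqDist

    fd = FreqDist('abracadabra')   # counts the five distinct characters
    print(fd.most_common(3))       # [('a', 5), ('b', 2), ('r', 2)]
    fd.tabulate()                  # aligned samples over counts
    fd.plot(cumulative=True)       # cumulative line plot via the pylab interface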
 
     def copy(self):
@@ -387,47 +384,16 @@ class FreqDist(Counter):
         return self.__class__(super(FreqDist, self).__and__(other))
 
     def __le__(self, other):
-        """
-        Returns True if this frequency distribution is a subset of the other
-        and for no key the value exceeds the value of the same key from
-        the other frequency distribution.
-
-        The <= operator forms partial order and satisfying the axioms
-        reflexivity, antisymmetry and transitivity.
-
-        >>> FreqDist('a') <= FreqDist('a')
-        True
-        >>> a = FreqDist('abc')
-        >>> b = FreqDist('aabc')
-        >>> (a <= b, b <= a)
-        (True, False)
-        >>> FreqDist('a') <= FreqDist('abcd')
-        True
-        >>> FreqDist('abc') <= FreqDist('xyz')
-        False
-        >>> FreqDist('xyz') <= FreqDist('abc')
-        False
-        >>> c = FreqDist('a')
-        >>> d = FreqDist('aa')
-        >>> e = FreqDist('aaa')
-        >>> c <= d and d <= e and c <= e
-        True
-        """
         if not isinstance(other, FreqDist):
             raise_unorderable_types("<=", self, other)
         return set(self).issubset(other) and all(
             self[key] <= other[key] for key in self
         )
 
-    def __ge__(self, other):
-        if not isinstance(other, FreqDist):
-            raise_unorderable_types(">=", self, other)
-        return set(self).issuperset(other) and all(
-            self[key] >= other[key] for key in other
-        )
-
+    # @total_ordering doesn't work here, since the class inherits from a builtin class
+    __ge__ = lambda self, other: not self <= other or self == other
     __lt__ = lambda self, other: self <= other and not self == other
-    __gt__ = lambda self, other: self >= other and not self == other
+    __gt__ = lambda self, other: not self <= other
 
     def __repr__(self):
         """
@@ -455,10 +421,10 @@ class FreqDist(Counter):
         :type maxlen: int
         :rtype: string
         """
-        items = ["{0!r}: {1!r}".format(*item) for item in self.most_common(maxlen)]
+        items = ['{0!r}: {1!r}'.format(*item) for item in self.most_common(maxlen)]
         if len(self) > maxlen:
-            items.append("...")
-        return "FreqDist({{{0}}})".format(", ".join(items))
+            items.append('...')
+        return 'FreqDist({{{0}}})'.format(', '.join(items))
 
     def __str__(self):
         """
@@ -466,16 +432,7 @@ class FreqDist(Counter):
 
         :rtype: string
         """
-        return "<FreqDist with %d samples and %d outcomes>" % (len(self), self.N())
-
-    def __iter__(self):
-        """
-        Return an iterator which yields tokens ordered by frequency.
-
-        :rtype: iterator
-        """
-        for token, _ in self.most_common(self.B()):
-            yield token
+        return '<FreqDist with %d samples and %d outcomes>' % (len(self), self.N())
 
 
 ##//////////////////////////////////////////////////////
@@ -483,7 +440,8 @@ class FreqDist(Counter):
 ##//////////////////////////////////////////////////////
 
 
-class ProbDistI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class ProbDistI(object):
     """
     A probability distribution for the outcomes of an experiment.  A
     probability distribution specifies how likely it is that an
@@ -586,7 +544,7 @@ class ProbDistI(metaclass=ABCMeta):
         return random.choice(list(self.samples()))
 
 
-
+@compat.python_2_unicode_compatible
 class UniformProbDist(ProbDistI):
     """
     A probability distribution that assigns equal probability to each
@@ -606,7 +564,7 @@ class UniformProbDist(ProbDistI):
         """
         if len(samples) == 0:
             raise ValueError(
-                "A Uniform probability distribution must " + "have at least one sample."
+                'A Uniform probability distribution must ' + 'have at least one sample.'
             )
         self._sampleset = set(samples)
         self._prob = 1.0 / len(self._sampleset)
@@ -622,10 +580,10 @@ class UniformProbDist(ProbDistI):
         return self._samples
 
     def __repr__(self):
-        return "<UniformProbDist with %d samples>" % len(self._sampleset)
-
+        return '<UniformProbDist with %d samples>' % len(self._sampleset)
 
 
+@compat.python_2_unicode_compatible
 class RandomProbDist(ProbDistI):
     """
     Generates a random probability distribution whereby each sample
@@ -636,7 +594,7 @@ class RandomProbDist(ProbDistI):
     def __init__(self, samples):
         if len(samples) == 0:
             raise ValueError(
-                "A probability distribution must " + "have at least one sample."
+                'A probability distribution must ' + 'have at least one sample.'
             )
         self._probs = self.unirand(samples)
         self._samples = list(self._probs.keys())
@@ -664,7 +622,7 @@ class RandomProbDist(ProbDistI):
         return dict((s, randrow[i]) for i, s in enumerate(samples))
 
     def max(self):
-        if not hasattr(self, "_max"):
+        if not hasattr(self, '_max'):
             self._max = max((p, v) for (v, p) in self._probs.items())[1]
         return self._max
 
@@ -675,10 +633,10 @@ class RandomProbDist(ProbDistI):
         return self._samples
 
     def __repr__(self):
-        return "<RandomUniformProbDist with %d samples>" % len(self._probs)
-
+        return '<RandomUniformProbDist with %d samples>' % len(self._probs)
 
 
+@compat.python_2_unicode_compatible
 class DictionaryProbDist(ProbDistI):
     """
     A probability distribution whose probabilities are directly
@@ -705,8 +663,8 @@ class DictionaryProbDist(ProbDistI):
         if normalize:
             if len(prob_dict) == 0:
                 raise ValueError(
-                    "A DictionaryProbDist must have at least one sample "
-                    + "before it can be normalized."
+                    'A DictionaryProbDist must have at least one sample '
+                    + 'before it can be normalized.'
                 )
             if log:
                 value_sum = sum_logs(list(self._prob_dict.values()))
@@ -746,7 +704,7 @@ class DictionaryProbDist(ProbDistI):
                 return math.log(self._prob_dict[sample], 2)
 
     def max(self):
-        if not hasattr(self, "_max"):
+        if not hasattr(self, '_max'):
             self._max = max((p, v) for (v, p) in self._prob_dict.items())[1]
         return self._max
 
@@ -754,10 +712,10 @@ class DictionaryProbDist(ProbDistI):
         return self._prob_dict.keys()
 
     def __repr__(self):
-        return "<ProbDist with %d samples>" % len(self._prob_dict)
-
+        return '<ProbDist with %d samples>' % len(self._prob_dict)
 
 
+@compat.python_2_unicode_compatible
 class MLEProbDist(ProbDistI):
     """
     The maximum likelihood estimate for the probability distribution
@@ -801,10 +759,10 @@ class MLEProbDist(ProbDistI):
         :rtype: str
         :return: A string representation of this ``ProbDist``.
         """
-        return "<MLEProbDist based on %d samples>" % self._freqdist.N()
-
+        return '<MLEProbDist based on %d samples>' % self._freqdist.N()
 
 
+@compat.python_2_unicode_compatible
 class LidstoneProbDist(ProbDistI):
     """
     The Lidstone estimate for the probability distribution of the
@@ -844,15 +802,15 @@ class LidstoneProbDist(ProbDistI):
         if (bins == 0) or (bins is None and freqdist.N() == 0):
             name = self.__class__.__name__[:-8]
             raise ValueError(
-                "A %s probability distribution " % name + "must have at least one bin."
+                'A %s probability distribution ' % name + 'must have at least one bin.'
             )
         if (bins is not None) and (bins < freqdist.B()):
             name = self.__class__.__name__[:-8]
             raise ValueError(
-                "\nThe number of bins in a %s distribution " % name
-                + "(%d) must be greater than or equal to\n" % bins
-                + "the number of bins in the FreqDist used "
-                + "to create it (%d)." % freqdist.B()
+                '\nThe number of bins in a %s distribution ' % name
+                + '(%d) must be greater than or equal to\n' % bins
+                + 'the number of bins in the FreqDist used '
+                + 'to create it (%d).' % freqdist.B()
             )
 
         self._freqdist = freqdist
@@ -902,10 +860,10 @@ class LidstoneProbDist(ProbDistI):
 
         :rtype: str
         """
-        return "<LidstoneProbDist based on %d samples>" % self._freqdist.N()
-
+        return '<LidstoneProbDist based on %d samples>' % self._freqdist.N()
 
 
+@compat.python_2_unicode_compatible
 class LaplaceProbDist(LidstoneProbDist):
     """
     The Laplace estimate for the probability distribution of the
@@ -939,10 +897,10 @@ class LaplaceProbDist(LidstoneProbDist):
         :rtype: str
         :return: A string representation of this ``ProbDist``.
         """
-        return "<LaplaceProbDist based on %d samples>" % self._freqdist.N()
-
+        return '<LaplaceProbDist based on %d samples>' % self._freqdist.N()
 
 
+@compat.python_2_unicode_compatible
 class ELEProbDist(LidstoneProbDist):
     """
     The expected likelihood estimate for the probability distribution
@@ -977,10 +935,10 @@ class ELEProbDist(LidstoneProbDist):
 
         :rtype: str
         """
-        return "<ELEProbDist based on %d samples>" % self._freqdist.N()
-
+        return '<ELEProbDist based on %d samples>' % self._freqdist.N()
 
 
+@compat.python_2_unicode_compatible
 class HeldoutProbDist(ProbDistI):
     """
     The heldout estimate for the probability distribution of the
@@ -1141,11 +1099,11 @@ class HeldoutProbDist(ProbDistI):
         :rtype: str
         :return: A string representation of this ``ProbDist``.
         """
-        s = "<HeldoutProbDist: %d base samples; %d heldout samples>"
+        s = '<HeldoutProbDist: %d base samples; %d heldout samples>'
         return s % (self._base_fdist.N(), self._heldout_fdist.N())
 
 
-
+@compat.python_2_unicode_compatible
 class CrossValidationProbDist(ProbDistI):
     """
     The cross-validation estimate for the probability distribution of
@@ -1213,10 +1171,10 @@ class CrossValidationProbDist(ProbDistI):
 
         :rtype: str
         """
-        return "<CrossValidationProbDist: %d-way>" % len(self._freqdists)
-
+        return '<CrossValidationProbDist: %d-way>' % len(self._freqdists)
 
 
+@compat.python_2_unicode_compatible
 class WittenBellProbDist(ProbDistI):
     """
     The Witten-Bell estimate of a probability distribution. This distribution
@@ -1260,7 +1218,7 @@ class WittenBellProbDist(ProbDistI):
         :type bins: int
         """
         assert bins is None or bins >= freqdist.B(), (
-            "bins parameter must not be less than %d=freqdist.B()" % freqdist.B()
+            'bins parameter must not be less than %d=freqdist.B()' % freqdist.B()
         )
         if bins is None:
             bins = freqdist.B()
@@ -1298,7 +1256,7 @@ class WittenBellProbDist(ProbDistI):
 
         :rtype: str
         """
-        return "<WittenBellProbDist based on %d samples>" % self._freqdist.N()
+        return '<WittenBellProbDist based on %d samples>' % self._freqdist.N()
 
 
 ##//////////////////////////////////////////////////////
@@ -1360,7 +1318,7 @@ class WittenBellProbDist(ProbDistI):
 ##//////////////////////////////////////////////////////
 
 
-
+@compat.python_2_unicode_compatible
 class SimpleGoodTuringProbDist(ProbDistI):
     """
     SimpleGoodTuring ProbDist approximates from frequency to frequency of
@@ -1395,7 +1353,7 @@ class SimpleGoodTuringProbDist(ProbDistI):
         """
         assert (
             bins is None or bins > freqdist.B()
-        ), "bins parameter must not be less than %d=freqdist.B()+1" % (freqdist.B() + 1)
+        ), 'bins parameter must not be less than %d=freqdist.B()+1' % (freqdist.B() + 1)
         if bins is None:
             bins = freqdist.B() + 1
         self._freqdist = freqdist
@@ -1454,10 +1412,10 @@ class SimpleGoodTuringProbDist(ProbDistI):
         self._slope = xy_cov / x_var if x_var != 0 else 0.0
         if self._slope >= -1:
             warnings.warn(
-                "SimpleGoodTuring did not find a proper best fit "
-                "line for smoothing probabilities of occurrences. "
-                "The probability estimates are likely to be "
-                "unreliable."
+                'SimpleGoodTuring did not find a proper best fit '
+                'line for smoothing probabilities of occurrences. '
+                'The probability estimates are likely to be '
+                'unreliable.'
             )
         self._intercept = y_mean - self._slope * x_mean
 
@@ -1581,7 +1539,7 @@ class SimpleGoodTuringProbDist(ProbDistI):
 
         :rtype: str
         """
-        return "<SimpleGoodTuringProbDist based on %d samples>" % self._freqdist.N()
+        return '<SimpleGoodTuringProbDist based on %d samples>' % self._freqdist.N()
 
 
 class MutableProbDist(ProbDistI):
@@ -1634,7 +1592,7 @@ class MutableProbDist(ProbDistI):
         # inherit documentation
         i = self._sample_dict.get(sample)
         if i is None:
-            return float("-inf")
+            return float('-inf')
         return self._data[i] if self._logs else math.log(self._data[i], 2)
 
     def update(self, sample, prob, log=True):
@@ -1696,7 +1654,7 @@ class MutableProbDist(ProbDistI):
 # where possible.
 
 
-
+@compat.python_2_unicode_compatible
 class KneserNeyProbDist(ProbDistI):
     """
     Kneser-Ney estimate of a probability distribution. This is a version of
@@ -1745,7 +1703,7 @@ class KneserNeyProbDist(ProbDistI):
     def prob(self, trigram):
         # sample must be a triple
         if len(trigram) != 3:
-            raise ValueError("Expected an iterable with 3 members.")
+            raise ValueError('Expected an iterable with 3 members.')
         trigram = tuple(trigram)
         w0, w1, w2 = trigram
 
@@ -1803,12 +1761,12 @@ class KneserNeyProbDist(ProbDistI):
         return self._trigrams.max()
 
     def __repr__(self):
-        """
+        '''
         Return a string representation of this ProbDist
 
         :rtype: str
-        """
-        return "<KneserNeyProbDist based on {0} trigrams".format(self._trigrams.N())
+        '''
+        return '<KneserNeyProbDist based on {0} trigrams'.format(self._trigrams.N())
 
 
 ##//////////////////////////////////////////////////////
@@ -1818,7 +1776,7 @@ class KneserNeyProbDist(ProbDistI):
 
 def log_likelihood(test_pdist, actual_pdist):
     if not isinstance(test_pdist, ProbDistI) or not isinstance(actual_pdist, ProbDistI):
-        raise ValueError("expected a ProbDist.")
+        raise ValueError('expected a ProbDist.')
     # Is this right?
     return sum(
         actual_pdist.prob(s) * math.log(test_pdist.prob(s), 2) for s in actual_pdist
@@ -1835,7 +1793,7 @@ def entropy(pdist):
 ##//////////////////////////////////////////////////////
 
 
-
+@compat.python_2_unicode_compatible
 class ConditionalFreqDist(defaultdict):
     """
     A collection of frequency distributions for a single experiment
@@ -1923,7 +1881,7 @@ class ConditionalFreqDist(defaultdict):
 
         :rtype: int
         """
-        return sum(fdist.N() for fdist in self.values())
+        return sum(fdist.N() for fdist in itervalues(self))
 
     def plot(self, *args, **kwargs):
         """
@@ -1939,58 +1897,47 @@ class ConditionalFreqDist(defaultdict):
         :type conditions: list
         """
         try:
-            import matplotlib.pyplot as plt #import statment fix
+            from matplotlib import pylab
         except ImportError:
             raise ValueError(
-                "The plot function requires matplotlib to be installed."
-                "See http://matplotlib.org/"
+                'The plot function requires matplotlib to be installed.'
+                'See http://matplotlib.org/'
             )
 
         cumulative = _get_kwarg(kwargs, 'cumulative', False)
         percents = _get_kwarg(kwargs, 'percents', False)
-        conditions = [c for c in _get_kwarg(kwargs, 'conditions', self.conditions()) if c in self] # conditions should be in self
+        conditions = _get_kwarg(kwargs, 'conditions', sorted(self.conditions()))
         title = _get_kwarg(kwargs, 'title', '')
         samples = _get_kwarg(
-            kwargs, 'samples', sorted(set(v 
-                                            for c in conditions
-                                            for v in self[c]))
+            kwargs, 'samples', sorted(set(v for c in conditions for v in self[c]))
         )  # this computation could be wasted
         if "linewidth" not in kwargs:
             kwargs["linewidth"] = 2
-        ax = plt.gca()
-        if (len(conditions) != 0):
-            freqs = []
-            for condition in conditions:
-                if cumulative:
-                    # freqs should be a list of list where each sub list will be a frequency of a condition
-                    freqs.append(list(self[condition]._cumulative_frequencies(samples)))
-                    ylabel = "Cumulative Counts"
-                    legend_loc = 'lower right'
-                    if percents:
-                        freqs[-1] = [f / freqs[len(freqs) - 1] * 100 for f in freqs]
-                        ylabel = "Cumulative Percents"
-                else:
-                    freqs.append([self[condition][sample] for sample in samples])
-                    ylabel = "Counts"
-                    legend_loc = 'upper right'
-                # percents = [f * 100 for f in freqs] only in ConditionalProbDist?
-
-            i = 0
-            for freq in freqs:
-                kwargs['label'] = conditions[i] #label for each condition
-                i += 1
-                ax.plot(freq, *args, **kwargs)
-            ax.legend(loc=legend_loc)
-            ax.grid(True, color="silver")
-            ax.set_xticks(range(len(samples)))
-            ax.set_xticklabels([str(s) for s in samples], rotation=90)
-            if title:
-                ax.set_title(title)
-            ax.set_xlabel("Samples")
-            ax.set_ylabel(ylabel)
-        plt.show()
-
-        return ax
+
+        for condition in conditions:
+            if cumulative:
+                freqs = list(self[condition]._cumulative_frequencies(samples))
+                ylabel = "Cumulative Counts"
+                legend_loc = 'lower right'
+                if percents:
+                    freqs = [f / freqs[len(freqs) - 1] * 100 for f in freqs]
+                    ylabel = "Cumulative Percents"
+            else:
+                freqs = [self[condition][sample] for sample in samples]
+                ylabel = "Counts"
+                legend_loc = 'upper right'
+            # percents = [f * 100 for f in freqs] only in ConditionalProbDist?
+            kwargs['label'] = "%s" % condition
+            pylab.plot(freqs, *args, **kwargs)
+
+        pylab.legend(loc=legend_loc)
+        pylab.grid(True, color="silver")
+        pylab.xticks(range(len(samples)), [text_type(s) for s in samples], rotation=90)
+        if title:
+            pylab.title(title)
+        pylab.xlabel("Samples")
+        pylab.ylabel(ylabel)
+        pylab.show()
 
     def tabulate(self, *args, **kwargs):
         """
@@ -2004,12 +1951,10 @@ class ConditionalFreqDist(defaultdict):
         :type title: bool
         """
 
-        cumulative = _get_kwarg(kwargs, "cumulative", False)
-        conditions = _get_kwarg(kwargs, "conditions", sorted(self.conditions()))
+        cumulative = _get_kwarg(kwargs, 'cumulative', False)
+        conditions = _get_kwarg(kwargs, 'conditions', sorted(self.conditions()))
         samples = _get_kwarg(
-            kwargs,
-            "samples",
-            sorted(set(v for c in conditions if c in self for v in self[c])),
+            kwargs, 'samples', sorted(set(v for c in conditions for v in self[c]))
         )  # this computation could be wasted
 
         width = max(len("%s" % s) for s in samples)
@@ -2022,14 +1967,14 @@ class ConditionalFreqDist(defaultdict):
             width = max(width, max(len("%d" % f) for f in freqs[c]))
 
         condition_size = max(len("%s" % c) for c in conditions)
-        print(" " * condition_size, end=" ")
+        print(' ' * condition_size, end=' ')
         for s in samples:
-            print("%*s" % (width, s), end=" ")
+            print("%*s" % (width, s), end=' ')
         print()
         for c in conditions:
-            print("%*s" % (condition_size, c), end=" ")
+            print("%*s" % (condition_size, c), end=' ')
             for f in freqs[c]:
-                print("%*d" % (width, f), end=" ")
+                print("%*d" % (width, f), end=' ')
             print()
 
     # Mathematical operators
@@ -2130,11 +2075,12 @@ class ConditionalFreqDist(defaultdict):
 
         :rtype: str
         """
-        return "<ConditionalFreqDist with %d conditions>" % len(self)
-
+        return '<ConditionalFreqDist with %d conditions>' % len(self)
 
 
-class ConditionalProbDistI(dict, metaclass=ABCMeta):
+@compat.python_2_unicode_compatible
+@add_metaclass(ABCMeta)
+class ConditionalProbDistI(dict):
     """
     A collection of probability distributions for a single experiment
     run under different conditions.  Conditional probability
@@ -2170,7 +2116,7 @@ class ConditionalProbDistI(dict, metaclass=ABCMeta):
 
         :rtype: str
         """
-        return "<%s with %d conditions>" % (type(self).__name__, len(self))
+        return '<%s with %d conditions>' % (type(self).__name__, len(self))
 
 
 class ConditionalProbDist(ConditionalProbDistI):
@@ -2336,13 +2282,13 @@ class ProbabilisticMixIn(object):
             the object.
         :type logprob: float
         """
-        if "prob" in kwargs:
-            if "logprob" in kwargs:
-                raise TypeError("Must specify either prob or logprob " "(not both)")
+        if 'prob' in kwargs:
+            if 'logprob' in kwargs:
+                raise TypeError('Must specify either prob or logprob ' '(not both)')
             else:
-                ProbabilisticMixIn.set_prob(self, kwargs["prob"])
-        elif "logprob" in kwargs:
-            ProbabilisticMixIn.set_logprob(self, kwargs["logprob"])
+                ProbabilisticMixIn.set_prob(self, kwargs['prob'])
+        elif 'logprob' in kwargs:
+            ProbabilisticMixIn.set_logprob(self, kwargs['logprob'])
         else:
             self.__prob = self.__logprob = None
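A short sketch of how the prob/logprob keyword handling above is normally exercised; the subclass name here is hypothetical and used only for illustration:

    from nltk.probability import ProbabilisticMixIn

    class ProbabilisticLabel(ProbabilisticMixIn):
        # Hypothetical helper class, not part of NLTK.
        def __init__(self, label, **kwargs):
            self.label = label
            ProbabilisticMixIn.__init__(self, **kwargs)

    item = ProbabilisticLabel('NP', prob=0.25)   # or logprob=-2.0, but not both
    print(item.prob(), item.logprob())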
 
@@ -2396,10 +2342,10 @@ class ProbabilisticMixIn(object):
 
 class ImmutableProbabilisticMixIn(ProbabilisticMixIn):
     def set_prob(self, prob):
-        raise ValueError("%s is immutable" % self.__class__.__name__)
+        raise ValueError('%s is immutable' % self.__class__.__name__)
 
     def set_logprob(self, prob):
-        raise ValueError("%s is immutable" % self.__class__.__name__)
+        raise ValueError('%s is immutable' % self.__class__.__name__)
 
 
 ## Helper function for processing keyword arguments
@@ -2425,6 +2371,7 @@ def _create_rand_fdist(numsamples, numoutcomes):
     samples are numbers from 1 to ``numsamples``, and are generated by
     summing two numbers, each of which has a uniform distribution.
     """
+    import random
 
     fdist = FreqDist()
     for x in range(numoutcomes):
@@ -2493,82 +2440,82 @@ def demo(numsamples=6, numoutcomes=500):
     # Print the results in a formatted table.
     print(
         (
-            "%d samples (1-%d); %d outcomes were sampled for each FreqDist"
+            '%d samples (1-%d); %d outcomes were sampled for each FreqDist'
             % (numsamples, numsamples, numoutcomes)
         )
     )
-    print("=" * 9 * (len(pdists) + 2))
-    FORMATSTR = "      FreqDist " + "%8s " * (len(pdists) - 1) + "|  Actual"
+    print('=' * 9 * (len(pdists) + 2))
+    FORMATSTR = '      FreqDist ' + '%8s ' * (len(pdists) - 1) + '|  Actual'
     print(FORMATSTR % tuple(repr(pdist)[1:9] for pdist in pdists[:-1]))
-    print("-" * 9 * (len(pdists) + 2))
-    FORMATSTR = "%3d   %8.6f " + "%8.6f " * (len(pdists) - 1) + "| %8.6f"
+    print('-' * 9 * (len(pdists) + 2))
+    FORMATSTR = '%3d   %8.6f ' + '%8.6f ' * (len(pdists) - 1) + '| %8.6f'
     for val in vals:
         print(FORMATSTR % val)
 
     # Print the totals for each column (should all be 1.0)
     zvals = list(zip(*vals))
     sums = [sum(val) for val in zvals[1:]]
-    print("-" * 9 * (len(pdists) + 2))
-    FORMATSTR = "Total " + "%8.6f " * (len(pdists)) + "| %8.6f"
+    print('-' * 9 * (len(pdists) + 2))
+    FORMATSTR = 'Total ' + '%8.6f ' * (len(pdists)) + '| %8.6f'
     print(FORMATSTR % tuple(sums))
-    print("=" * 9 * (len(pdists) + 2))
+    print('=' * 9 * (len(pdists) + 2))
 
     # Display the distributions themselves, if they're short enough.
     if len("%s" % fdist1) < 70:
-        print("  fdist1: %s" % fdist1)
-        print("  fdist2: %s" % fdist2)
-        print("  fdist3: %s" % fdist3)
+        print('  fdist1: %s' % fdist1)
+        print('  fdist2: %s' % fdist2)
+        print('  fdist3: %s' % fdist3)
     print()
 
-    print("Generating:")
+    print('Generating:')
     for pdist in pdists:
         fdist = FreqDist(pdist.generate() for i in range(5000))
-        print("%20s %s" % (pdist.__class__.__name__[:20], ("%s" % fdist)[:55]))
+        print('%20s %s' % (pdist.__class__.__name__[:20], ("%s" % fdist)[:55]))
     print()
 
 
 def gt_demo():
     from nltk import corpus
 
-    emma_words = corpus.gutenberg.words("austen-emma.txt")
+    emma_words = corpus.gutenberg.words('austen-emma.txt')
     fd = FreqDist(emma_words)
     sgt = SimpleGoodTuringProbDist(fd)
-    print("%18s %8s  %14s" % ("word", "freqency", "SimpleGoodTuring"))
+    print('%18s %8s  %14s' % ("word", "freqency", "SimpleGoodTuring"))
     fd_keys_sorted = (
         key for key, value in sorted(fd.items(), key=lambda item: item[1], reverse=True)
     )
     for key in fd_keys_sorted:
-        print("%18s %8d  %14e" % (key, fd[key], sgt.prob(key)))
+        print('%18s %8d  %14e' % (key, fd[key], sgt.prob(key)))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo(6, 10)
     demo(5, 5000)
     gt_demo()
 
 __all__ = [
-    "ConditionalFreqDist",
-    "ConditionalProbDist",
-    "ConditionalProbDistI",
-    "CrossValidationProbDist",
-    "DictionaryConditionalProbDist",
-    "DictionaryProbDist",
-    "ELEProbDist",
-    "FreqDist",
-    "SimpleGoodTuringProbDist",
-    "HeldoutProbDist",
-    "ImmutableProbabilisticMixIn",
-    "LaplaceProbDist",
-    "LidstoneProbDist",
-    "MLEProbDist",
-    "MutableProbDist",
-    "KneserNeyProbDist",
-    "ProbDistI",
-    "ProbabilisticMixIn",
-    "UniformProbDist",
-    "WittenBellProbDist",
-    "add_logs",
-    "log_likelihood",
-    "sum_logs",
-    "entropy",
+    'ConditionalFreqDist',
+    'ConditionalProbDist',
+    'ConditionalProbDistI',
+    'CrossValidationProbDist',
+    'DictionaryConditionalProbDist',
+    'DictionaryProbDist',
+    'ELEProbDist',
+    'FreqDist',
+    'SimpleGoodTuringProbDist',
+    'HeldoutProbDist',
+    'ImmutableProbabilisticMixIn',
+    'LaplaceProbDist',
+    'LidstoneProbDist',
+    'MLEProbDist',
+    'MutableProbDist',
+    'KneserNeyProbDist',
+    'ProbDistI',
+    'ProbabilisticMixIn',
+    'UniformProbDist',
+    'WittenBellProbDist',
+    'add_logs',
+    'log_likelihood',
+    'sum_logs',
+    'entropy',
 ]
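For the ConditionalFreqDist plotting and tabulation changes above, a minimal usage sketch (the word pairs are made up; plot() requires matplotlib, as the import check enforces):

    from nltk import ConditionalFreqDist

    pairs = [('news', 'the'), ('news', 'of'), ('fiction', 'the'), ('fiction', 'said')]
    cfd = ConditionalFreqDist(pairs)
    cfd.tabulate(conditions=['news', 'fiction'], samples=['the', 'of', 'said'])
    # cfd.plot(conditions=['news', 'fiction'], cumulative=True)  # needs matplotlib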
index bc2bca4..2d60761 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Semantic Interpretation
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
index bc69cda..1551347 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/__init__.cpython-37.pyc differ
index d3a4ad2..854ba8d 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/boxer.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/boxer.cpython-37.pyc differ
index 45a022e..3977609 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/chat80.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/chat80.cpython-37.pyc differ
index fd34437..9206ce7 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/cooper_storage.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/cooper_storage.cpython-37.pyc differ
index fb72736..ac0a3bd 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/drt.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/drt.cpython-37.pyc differ
index 33f6c6e..44ddf89 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/drt_glue_demo.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/drt_glue_demo.cpython-37.pyc differ
index b212373..18e5a8b 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/evaluate.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/evaluate.cpython-37.pyc differ
index c722470..28c2466 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/glue.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/glue.cpython-37.pyc differ
index 95c6642..639e230 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/hole.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/hole.cpython-37.pyc differ
index a5b8834..fad5ba5 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/lfg.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/lfg.cpython-37.pyc differ
index 2cf3401..fb58cb4 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/linearlogic.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/linearlogic.cpython-37.pyc differ
index 5089632..436e402 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/logic.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/logic.cpython-37.pyc differ
index 4f0b2a1..9f313f7 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/relextract.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/relextract.cpython-37.pyc differ
index 95e789a..2e6f730 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/skolemize.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/skolemize.cpython-37.pyc differ
index fc5fcd4..6878924 100644 (file)
Binary files a/nlp_resource_data/nltk/sem/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/sem/__pycache__/util.cpython-37.pyc differ
index bc87dab..8113165 100644 (file)
@@ -3,7 +3,7 @@
 #
 # Author: Dan Garrette <dhgarrette@gmail.com>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
@@ -24,6 +24,7 @@ Usage:
         models/
             boxer/
 """
+from __future__ import print_function, unicode_literals
 
 import os
 import re
@@ -54,6 +55,8 @@ from nltk.sem.drt import (
     DrtVariableExpression,
 )
 
+from nltk.compat import python_2_unicode_compatible
+
 
 class Boxer(object):
     """
@@ -90,11 +93,11 @@ class Boxer(object):
         self.set_bin_dir(bin_dir, verbose)
 
     def set_bin_dir(self, bin_dir, verbose=False):
-        self._candc_bin = self._find_binary("candc", bin_dir, verbose)
+        self._candc_bin = self._find_binary('candc', bin_dir, verbose)
         self._candc_models_path = os.path.normpath(
-            os.path.join(self._candc_bin[:-5], "../models")
+            os.path.join(self._candc_bin[:-5], '../models')
         )
-        self._boxer_bin = self._find_binary("boxer", bin_dir, verbose)
+        self._boxer_bin = self._find_binary('boxer', bin_dir, verbose)
 
     def interpret(self, input, discourse_id=None, question=False, verbose=False):
         """
@@ -179,13 +182,13 @@ class Boxer(object):
         :return: stdout
         """
         args = [
-            "--models",
-            os.path.join(self._candc_models_path, ["boxer", "questions"][question]),
-            "--candc-printer",
-            "boxer",
+            '--models',
+            os.path.join(self._candc_models_path, ['boxer', 'questions'][question]),
+            '--candc-printer',
+            'boxer',
         ]
         return self._call(
-            "\n".join(
+            '\n'.join(
                 sum(
                     (
                         ["<META>'{0}'".format(id)] + d
@@ -209,29 +212,29 @@ class Boxer(object):
         f = None
         try:
             fd, temp_filename = tempfile.mkstemp(
-                prefix="boxer-", suffix=".in", text=True
+                prefix='boxer-', suffix='.in', text=True
             )
-            f = os.fdopen(fd, "w")
+            f = os.fdopen(fd, 'w')
             f.write(candc_out)
         finally:
             if f:
                 f.close()
 
         args = [
-            "--box",
-            "false",
-            "--semantics",
-            "drs",
+            '--box',
+            'false',
+            '--semantics',
+            'drs',
             #'--flat', 'false', # removed from boxer
-            "--resolve",
-            ["false", "true"][self._resolve],
-            "--elimeq",
-            ["false", "true"][self._elimeq],
-            "--format",
-            "prolog",
-            "--instantiate",
-            "true",
-            "--input",
+            '--resolve',
+            ['false', 'true'][self._resolve],
+            '--elimeq',
+            ['false', 'true'][self._elimeq],
+            '--format',
+            'prolog',
+            '--instantiate',
+            'true',
+            '--input',
             temp_filename,
         ]
         stdout = self._call(None, self._boxer_bin, args, verbose)
@@ -242,9 +245,9 @@ class Boxer(object):
         return find_binary(
             name,
             path_to_bin=bin_dir,
-            env_vars=["CANDC"],
-            url="http://svn.ask.it.usyd.edu.au/trac/candc/",
-            binary_names=[name, name + ".exe"],
+            env_vars=['CANDC'],
+            url='http://svn.ask.it.usyd.edu.au/trac/candc/',
+            binary_names=[name, name + '.exe'],
             verbose=verbose,
         )
 
@@ -258,63 +261,63 @@ class Boxer(object):
         :return: stdout
         """
         if verbose:
-            print("Calling:", binary)
-            print("Args:", args)
-            print("Input:", input_str)
-            print("Command:", binary + " " + " ".join(args))
+            print('Calling:', binary)
+            print('Args:', args)
+            print('Input:', input_str)
+            print('Command:', binary + ' ' + ' '.join(args))
 
         # Call via a subprocess
         if input_str is None:
             cmd = [binary] + args
             p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         else:
-            cmd = 'echo "{0}" | {1} {2}'.format(input_str, binary, " ".join(args))
+            cmd = 'echo "{0}" | {1} {2}'.format(input_str, binary, ' '.join(args))
             p = subprocess.Popen(
                 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
             )
         stdout, stderr = p.communicate()
 
         if verbose:
-            print("Return code:", p.returncode)
+            print('Return code:', p.returncode)
             if stdout:
-                print("stdout:\n", stdout, "\n")
+                print('stdout:\n', stdout, '\n')
             if stderr:
-                print("stderr:\n", stderr, "\n")
+                print('stderr:\n', stderr, '\n')
         if p.returncode != 0:
             raise Exception(
-                "ERROR CALLING: {0} {1}\nReturncode: {2}\n{3}".format(
-                    binary, " ".join(args), p.returncode, stderr
+                'ERROR CALLING: {0} {1}\nReturncode: {2}\n{3}'.format(
+                    binary, ' '.join(args), p.returncode, stderr
                 )
             )
 
         return stdout
 
     def _parse_to_drs_dict(self, boxer_out, use_disc_id):
-        lines = boxer_out.split("\n")
+        lines = boxer_out.split('\n')
         drs_dict = {}
         i = 0
         while i < len(lines):
             line = lines[i]
-            if line.startswith("id("):
-                comma_idx = line.index(",")
+            if line.startswith('id('):
+                comma_idx = line.index(',')
                 discourse_id = line[3:comma_idx]
                 if discourse_id[0] == "'" and discourse_id[-1] == "'":
                     discourse_id = discourse_id[1:-1]
-                drs_id = line[comma_idx + 1 : line.index(")")]
+                drs_id = line[comma_idx + 1 : line.index(')')]
                 i += 1
                 line = lines[i]
-                assert line.startswith("sem({0},".format(drs_id))
+                assert line.startswith('sem({0},'.format(drs_id))
                 if line[-4:] == "').'":
                     line = line[:-4] + ")."
-                assert line.endswith(")."), "can't parse line: {0}".format(line)
+                assert line.endswith(').'), "can't parse line: {0}".format(line)
 
-                search_start = len("sem({0},[".format(drs_id))
+                search_start = len('sem({0},['.format(drs_id))
                 brace_count = 1
                 drs_start = -1
                 for j, c in enumerate(line[search_start:]):
-                    if c == "[":
+                    if c == '[':
                         brace_count += 1
-                    if c == "]":
+                    if c == ']':
                         brace_count -= 1
                         if brace_count == 0:
                             drs_start = search_start + j + 1
@@ -350,7 +353,7 @@ class BoxerOutputDrsParser(DrtParser):
         return DrtParser.parse(self, data, signature)
 
     def get_all_symbols(self):
-        return ["(", ")", ",", "[", "]", ":"]
+        return ['(', ')', ',', '[', ']', ':']
 
     def handle(self, tok, context):
         return self.handle_drs(tok)
@@ -371,11 +374,11 @@ class BoxerOutputDrsParser(DrtParser):
         return accum
 
     def handle_drs(self, tok):
-        if tok == "drs":
+        if tok == 'drs':
             return self.parse_drs()
-        elif tok in ["merge", "smerge"]:
+        elif tok in ['merge', 'smerge']:
             return self._handle_binary_expression(self._make_merge_expression)(None, [])
-        elif tok in ["alfa"]:
+        elif tok in ['alfa']:
             return self._handle_alfa(self._make_merge_expression)(None, [])
 
     def handle_condition(self, tok, indices):
@@ -385,32 +388,32 @@ class BoxerOutputDrsParser(DrtParser):
         :param indices: list of int
         :return: list of ``DrtExpression``
         """
-        if tok == "not":
+        if tok == 'not':
             return [self._handle_not()]
 
-        if tok == "or":
+        if tok == 'or':
             conds = [self._handle_binary_expression(self._make_or_expression)]
-        elif tok == "imp":
+        elif tok == 'imp':
             conds = [self._handle_binary_expression(self._make_imp_expression)]
-        elif tok == "eq":
+        elif tok == 'eq':
             conds = [self._handle_eq()]
-        elif tok == "prop":
+        elif tok == 'prop':
             conds = [self._handle_prop()]
 
-        elif tok == "pred":
+        elif tok == 'pred':
             conds = [self._handle_pred()]
-        elif tok == "named":
+        elif tok == 'named':
             conds = [self._handle_named()]
-        elif tok == "rel":
+        elif tok == 'rel':
             conds = [self._handle_rel()]
-        elif tok == "timex":
+        elif tok == 'timex':
             conds = self._handle_timex()
-        elif tok == "card":
+        elif tok == 'card':
             conds = [self._handle_card()]
 
-        elif tok == "whq":
+        elif tok == 'whq':
             conds = [self._handle_whq()]
-        elif tok == "duplex":
+        elif tok == 'duplex':
             conds = [self._handle_duplex()]
 
         else:
@@ -425,22 +428,22 @@ class BoxerOutputDrsParser(DrtParser):
         )
 
     def _handle_not(self):
-        self.assertToken(self.token(), "(")
+        self.assertToken(self.token(), '(')
         drs = self.process_next_expression(None)
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
         return BoxerNot(drs)
 
     def _handle_pred(self):
         # pred(_G3943, dog, n, 0)
-        self.assertToken(self.token(), "(")
+        self.assertToken(self.token(), '(')
         variable = self.parse_variable()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         name = self.token()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         pos = self.token()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         sense = int(self.token())
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
 
         def _handle_pred_f(sent_index, word_indices):
             return BoxerPred(
@@ -451,7 +454,7 @@ class BoxerOutputDrsParser(DrtParser):
 
     def _handle_duplex(self):
         # duplex(whq, drs(...), var, drs(...))
-        self.assertToken(self.token(), "(")
+        self.assertToken(self.token(), '(')
         # self.assertToken(self.token(), '[')
         ans_types = []
         # while self.token(0) != ']':
@@ -470,71 +473,71 @@ class BoxerOutputDrsParser(DrtParser):
         #         ans_types.append(self.token())
         # self.token() #swallow the ']'
 
-        self.assertToken(self.token(), "whq")
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), 'whq')
+        self.assertToken(self.token(), ',')
         d1 = self.process_next_expression(None)
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         ref = self.parse_variable()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         d2 = self.process_next_expression(None)
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
         return lambda sent_index, word_indices: BoxerWhq(
             self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
         )
 
     def _handle_named(self):
         # named(x0, john, per, 0)
-        self.assertToken(self.token(), "(")
+        self.assertToken(self.token(), '(')
         variable = self.parse_variable()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         name = self.token()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         type = self.token()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         sense = self.token()  # as per boxer rev 2554
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
         return lambda sent_index, word_indices: BoxerNamed(
             self.discourse_id, sent_index, word_indices, variable, name, type, sense
         )
 
     def _handle_rel(self):
         # rel(_G3993, _G3943, agent, 0)
-        self.assertToken(self.token(), "(")
+        self.assertToken(self.token(), '(')
         var1 = self.parse_variable()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         var2 = self.parse_variable()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         rel = self.token()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         sense = int(self.token())
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
         return lambda sent_index, word_indices: BoxerRel(
             self.discourse_id, sent_index, word_indices, var1, var2, rel, sense
         )
 
     def _handle_timex(self):
         # timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX'))
-        self.assertToken(self.token(), "(")
+        self.assertToken(self.token(), '(')
         arg = self.parse_variable()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         new_conds = self._handle_time_expression(arg)
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
         return new_conds
 
     def _handle_time_expression(self, arg):
         # date([]: (+), []:'XXXX', [1004]:'04', []:'XX')
         tok = self.token()
-        self.assertToken(self.token(), "(")
-        if tok == "date":
+        self.assertToken(self.token(), '(')
+        if tok == 'date':
             conds = self._handle_date(arg)
-        elif tok == "time":
+        elif tok == 'time':
             conds = self._handle_time(arg)
         else:
             return None
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
         return [
             lambda sent_index, word_indices: BoxerPred(
-                self.discourse_id, sent_index, word_indices, arg, tok, "n", 0
+                self.discourse_id, sent_index, word_indices, arg, tok, 'n', 0
             )
         ] + [lambda sent_index, word_indices: cond for cond in conds]
 
@@ -544,72 +547,72 @@ class BoxerOutputDrsParser(DrtParser):
         (sent_index, word_indices), = self._sent_and_word_indices(
             self._parse_index_list()
         )
-        self.assertToken(self.token(), "(")
+        self.assertToken(self.token(), '(')
         pol = self.token()
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
         conds.append(
             BoxerPred(
                 self.discourse_id,
                 sent_index,
                 word_indices,
                 arg,
-                "date_pol_{0}".format(pol),
-                "a",
+                'date_pol_{0}'.format(pol),
+                'a',
                 0,
             )
         )
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
 
         (sent_index, word_indices), = self._sent_and_word_indices(
             self._parse_index_list()
         )
         year = self.token()
-        if year != "XXXX":
-            year = year.replace(":", "_")
+        if year != 'XXXX':
+            year = year.replace(':', '_')
             conds.append(
                 BoxerPred(
                     self.discourse_id,
                     sent_index,
                     word_indices,
                     arg,
-                    "date_year_{0}".format(year),
-                    "a",
+                    'date_year_{0}'.format(year),
+                    'a',
                     0,
                 )
             )
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
 
         (sent_index, word_indices), = self._sent_and_word_indices(
             self._parse_index_list()
         )
         month = self.token()
-        if month != "XX":
+        if month != 'XX':
             conds.append(
                 BoxerPred(
                     self.discourse_id,
                     sent_index,
                     word_indices,
                     arg,
-                    "date_month_{0}".format(month),
-                    "a",
+                    'date_month_{0}'.format(month),
+                    'a',
                     0,
                 )
             )
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
 
         (sent_index, word_indices), = self._sent_and_word_indices(
             self._parse_index_list()
         )
         day = self.token()
-        if day != "XX":
+        if day != 'XX':
             conds.append(
                 BoxerPred(
                     self.discourse_id,
                     sent_index,
                     word_indices,
                     arg,
-                    "date_day_{0}".format(day),
-                    "a",
+                    'date_day_{0}'.format(day),
+                    'a',
                     0,
                 )
             )
@@ -621,43 +624,43 @@ class BoxerOutputDrsParser(DrtParser):
         conds = []
         self._parse_index_list()
         hour = self.token()
-        if hour != "XX":
-            conds.append(self._make_atom("r_hour_2", arg, hour))
-        self.assertToken(self.token(), ",")
+        if hour != 'XX':
+            conds.append(self._make_atom('r_hour_2', arg, hour))
+        self.assertToken(self.token(), ',')
 
         self._parse_index_list()
         min = self.token()
-        if min != "XX":
-            conds.append(self._make_atom("r_min_2", arg, min))
-        self.assertToken(self.token(), ",")
+        if min != 'XX':
+            conds.append(self._make_atom('r_min_2', arg, min))
+        self.assertToken(self.token(), ',')
 
         self._parse_index_list()
         sec = self.token()
-        if sec != "XX":
-            conds.append(self._make_atom("r_sec_2", arg, sec))
+        if sec != 'XX':
+            conds.append(self._make_atom('r_sec_2', arg, sec))
 
         return conds
 
     def _handle_card(self):
         # card(_G18535, 28, ge)
-        self.assertToken(self.token(), "(")
+        self.assertToken(self.token(), '(')
         variable = self.parse_variable()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         value = self.token()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         type = self.token()
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
         return lambda sent_index, word_indices: BoxerCard(
             self.discourse_id, sent_index, word_indices, variable, value, type
         )
 
     def _handle_prop(self):
         # prop(_G15949, drs(...))
-        self.assertToken(self.token(), "(")
+        self.assertToken(self.token(), '(')
         variable = self.parse_variable()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         drs = self.process_next_expression(None)
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
         return lambda sent_index, word_indices: BoxerProp(
             self.discourse_id, sent_index, word_indices, variable, drs
         )
@@ -665,99 +668,99 @@ class BoxerOutputDrsParser(DrtParser):
     def _parse_index_list(self):
         # [1001,1002]:
         indices = []
-        self.assertToken(self.token(), "[")
-        while self.token(0) != "]":
+        self.assertToken(self.token(), '[')
+        while self.token(0) != ']':
             indices.append(self.parse_index())
-            if self.token(0) == ",":
+            if self.token(0) == ',':
                 self.token()  # swallow ','
         self.token()  # swallow ']'
-        self.assertToken(self.token(), ":")
+        self.assertToken(self.token(), ':')
         return indices
 
     def parse_drs(self):
         # drs([[1001]:_G3943],
         #    [[1002]:pred(_G3943, dog, n, 0)]
         #   )
-        self.assertToken(self.token(), "(")
-        self.assertToken(self.token(), "[")
+        self.assertToken(self.token(), '(')
+        self.assertToken(self.token(), '[')
         refs = set()
-        while self.token(0) != "]":
+        while self.token(0) != ']':
             indices = self._parse_index_list()
             refs.add(self.parse_variable())
-            if self.token(0) == ",":
+            if self.token(0) == ',':
                 self.token()  # swallow ','
         self.token()  # swallow ']'
-        self.assertToken(self.token(), ",")
-        self.assertToken(self.token(), "[")
+        self.assertToken(self.token(), ',')
+        self.assertToken(self.token(), '[')
         conds = []
-        while self.token(0) != "]":
+        while self.token(0) != ']':
             indices = self._parse_index_list()
             conds.extend(self.parse_condition(indices))
-            if self.token(0) == ",":
+            if self.token(0) == ',':
                 self.token()  # swallow ','
         self.token()  # swallow ']'
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
         return BoxerDrs(list(refs), conds)
 
     def _handle_binary_expression(self, make_callback):
-        self.assertToken(self.token(), "(")
+        self.assertToken(self.token(), '(')
         drs1 = self.process_next_expression(None)
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         drs2 = self.process_next_expression(None)
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
         return lambda sent_index, word_indices: make_callback(
             sent_index, word_indices, drs1, drs2
         )
 
     def _handle_alfa(self, make_callback):
-        self.assertToken(self.token(), "(")
+        self.assertToken(self.token(), '(')
         type = self.token()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         drs1 = self.process_next_expression(None)
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         drs2 = self.process_next_expression(None)
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
         return lambda sent_index, word_indices: make_callback(
             sent_index, word_indices, drs1, drs2
         )
 
     def _handle_eq(self):
-        self.assertToken(self.token(), "(")
+        self.assertToken(self.token(), '(')
         var1 = self.parse_variable()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         var2 = self.parse_variable()
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
         return lambda sent_index, word_indices: BoxerEq(
             self.discourse_id, sent_index, word_indices, var1, var2
         )
 
     def _handle_whq(self):
-        self.assertToken(self.token(), "(")
-        self.assertToken(self.token(), "[")
+        self.assertToken(self.token(), '(')
+        self.assertToken(self.token(), '[')
         ans_types = []
-        while self.token(0) != "]":
+        while self.token(0) != ']':
             cat = self.token()
-            self.assertToken(self.token(), ":")
-            if cat == "des":
+            self.assertToken(self.token(), ':')
+            if cat == 'des':
                 ans_types.append(self.token())
-            elif cat == "num":
-                ans_types.append("number")
+            elif cat == 'num':
+                ans_types.append('number')
                 typ = self.token()
-                if typ == "cou":
-                    ans_types.append("count")
+                if typ == 'cou':
+                    ans_types.append('count')
                 else:
                     ans_types.append(typ)
             else:
                 ans_types.append(self.token())
         self.token()  # swallow the ']'
 
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         d1 = self.process_next_expression(None)
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         ref = self.parse_variable()
-        self.assertToken(self.token(), ",")
+        self.assertToken(self.token(), ',')
         d2 = self.process_next_expression(None)
-        self.assertToken(self.token(), ")")
+        self.assertToken(self.token(), ')')
         return lambda sent_index, word_indices: BoxerWhq(
             self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
         )
@@ -773,7 +776,7 @@ class BoxerOutputDrsParser(DrtParser):
 
     def parse_variable(self):
         var = self.token()
-        assert re.match("^[exps]\d+$", var), var
+        assert re.match('^[exps]\d+$', var), var
         return var
 
     def parse_index(self):
@@ -829,7 +832,7 @@ class BoxerDrsParser(DrtParser):
             #                 conds = self.handle_conds(None)
             #                 self.assertNextToken(DrtTokens.CLOSE)
             #                 return BoxerDrs(label, refs, conds)
-            if tok == "pred":
+            if tok == 'pred':
                 self.assertNextToken(DrtTokens.OPEN)
                 disc_id = (
                     self.discourse_id if self.discourse_id is not None else self.token()
@@ -848,7 +851,7 @@ class BoxerDrsParser(DrtParser):
                 sense = int(self.token())
                 self.assertNextToken(DrtTokens.CLOSE)
                 return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense)
-            elif tok == "named":
+            elif tok == 'named':
                 self.assertNextToken(DrtTokens.OPEN)
                 disc_id = (
                     self.discourse_id if self.discourse_id is not None else self.token()
@@ -869,7 +872,7 @@ class BoxerDrsParser(DrtParser):
                 return BoxerNamed(
                     disc_id, sent_id, word_ids, variable, name, type, sense
                 )
-            elif tok == "rel":
+            elif tok == 'rel':
                 self.assertNextToken(DrtTokens.OPEN)
                 disc_id = (
                     self.discourse_id if self.discourse_id is not None else self.token()
@@ -888,7 +891,7 @@ class BoxerDrsParser(DrtParser):
                 sense = int(self.token())
                 self.assertNextToken(DrtTokens.CLOSE)
                 return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense)
-            elif tok == "prop":
+            elif tok == 'prop':
                 self.assertNextToken(DrtTokens.OPEN)
                 disc_id = (
                     self.discourse_id if self.discourse_id is not None else self.token()
@@ -903,19 +906,19 @@ class BoxerDrsParser(DrtParser):
                 drs = self.process_next_expression(None)
                 self.assertNextToken(DrtTokens.CLOSE)
                 return BoxerProp(disc_id, sent_id, word_ids, variable, drs)
-            elif tok == "not":
+            elif tok == 'not':
                 self.assertNextToken(DrtTokens.OPEN)
                 drs = self.process_next_expression(None)
                 self.assertNextToken(DrtTokens.CLOSE)
                 return BoxerNot(drs)
-            elif tok == "imp":
+            elif tok == 'imp':
                 self.assertNextToken(DrtTokens.OPEN)
                 drs1 = self.process_next_expression(None)
                 self.assertNextToken(DrtTokens.COMMA)
                 drs2 = self.process_next_expression(None)
                 self.assertNextToken(DrtTokens.CLOSE)
                 return BoxerDrs(drs1.refs, drs1.conds, drs2)
-            elif tok == "or":
+            elif tok == 'or':
                 self.assertNextToken(DrtTokens.OPEN)
                 disc_id = (
                     self.discourse_id if self.discourse_id is not None else self.token()
@@ -930,7 +933,7 @@ class BoxerDrsParser(DrtParser):
                 drs2 = self.process_next_expression(None)
                 self.assertNextToken(DrtTokens.CLOSE)
                 return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2)
-            elif tok == "eq":
+            elif tok == 'eq':
                 self.assertNextToken(DrtTokens.OPEN)
                 disc_id = (
                     self.discourse_id if self.discourse_id is not None else self.token()
@@ -945,7 +948,7 @@ class BoxerDrsParser(DrtParser):
                 var2 = int(self.token())
                 self.assertNextToken(DrtTokens.CLOSE)
                 return BoxerEq(disc_id, sent_id, word_ids, var1, var2)
-            elif tok == "card":
+            elif tok == 'card':
                 self.assertNextToken(DrtTokens.OPEN)
                 disc_id = (
                     self.discourse_id if self.discourse_id is not None else self.token()
@@ -962,7 +965,7 @@ class BoxerDrsParser(DrtParser):
                 type = self.token()
                 self.assertNextToken(DrtTokens.CLOSE)
                 return BoxerCard(disc_id, sent_id, word_ids, var, value, type)
-            elif tok == "whq":
+            elif tok == 'whq':
                 self.assertNextToken(DrtTokens.OPEN)
                 disc_id = (
                     self.discourse_id if self.discourse_id is not None else self.token()
@@ -987,13 +990,13 @@ class BoxerDrsParser(DrtParser):
 
     def nullableIntToken(self):
         t = self.token()
-        return int(t) if t != "None" else None
+        return int(t) if t != 'None' else None
 
     def get_next_token_variable(self, description):
         try:
             return self.token()
         except ExpectedMoreTokensException as e:
-            raise ExpectedMoreTokensException(e.index, "Variable expected.")
+            raise ExpectedMoreTokensException(e.index, 'Variable expected.')
 
 
 class AbstractBoxerDrs(object):
@@ -1006,7 +1009,7 @@ class AbstractBoxerDrs(object):
 
     def variable_types(self):
         vartypes = {}
-        for t, vars in zip(("z", "e", "p"), self.variables()):
+        for t, vars in zip(('z', 'e', 'p'), self.variables()):
             for v in vars:
                 vartypes[v] = t
         return vartypes
@@ -1024,7 +1027,7 @@ class AbstractBoxerDrs(object):
         return self
 
     def _clean_name(self, name):
-        return name.replace("-", "_").replace("'", "_")
+        return name.replace('-', '_').replace("'", "_")
 
     def renumber_sentences(self, f):
         return self
@@ -1033,6 +1036,7 @@ class AbstractBoxerDrs(object):
         return hash("{0}".format(self))
 
 
+@python_2_unicode_compatible
 class BoxerDrs(AbstractBoxerDrs):
     def __init__(self, refs, conds, consequent=None):
         AbstractBoxerDrs.__init__(self)
@@ -1067,12 +1071,12 @@ class BoxerDrs(AbstractBoxerDrs):
         )
 
     def __repr__(self):
-        s = "drs([%s], [%s])" % (
-            ", ".join("%s" % r for r in self.refs),
-            ", ".join("%s" % c for c in self.conds),
+        s = 'drs([%s], [%s])' % (
+            ', '.join("%s" % r for r in self.refs),
+            ', '.join("%s" % c for c in self.conds),
         )
         if self.consequent is not None:
-            s = "imp(%s, %s)" % (s, self.consequent)
+            s = 'imp(%s, %s)' % (s, self.consequent)
         return s
 
     def __eq__(self, other):
@@ -1092,6 +1096,7 @@ class BoxerDrs(AbstractBoxerDrs):
     __hash__ = AbstractBoxerDrs.__hash__
 
 
+@python_2_unicode_compatible
 class BoxerNot(AbstractBoxerDrs):
     def __init__(self, drs):
         AbstractBoxerDrs.__init__(self)
@@ -1110,7 +1115,7 @@ class BoxerNot(AbstractBoxerDrs):
         return BoxerNot(self.drs.renumber_sentences(f))
 
     def __repr__(self):
-        return "not(%s)" % (self.drs)
+        return 'not(%s)' % (self.drs)
 
     def __eq__(self, other):
         return self.__class__ == other.__class__ and self.drs == other.drs
@@ -1121,6 +1126,7 @@ class BoxerNot(AbstractBoxerDrs):
     __hash__ = AbstractBoxerDrs.__hash__
 
 
+@python_2_unicode_compatible
 class BoxerIndexed(AbstractBoxerDrs):
     def __init__(self, discourse_id, sent_index, word_indices):
         AbstractBoxerDrs.__init__(self)
@@ -1146,15 +1152,15 @@ class BoxerIndexed(AbstractBoxerDrs):
     __hash__ = AbstractBoxerDrs.__hash__
 
     def __repr__(self):
-        s = "%s(%s, %s, [%s]" % (
+        s = '%s(%s, %s, [%s]' % (
             self._pred(),
             self.discourse_id,
             self.sent_index,
-            ", ".join("%s" % wi for wi in self.word_indices),
+            ', '.join("%s" % wi for wi in self.word_indices),
         )
         for v in self:
-            s += ", %s" % v
-        return s + ")"
+            s += ', %s' % v
+        return s + ')'
 
 
 class BoxerPred(BoxerIndexed):
@@ -1206,7 +1212,7 @@ class BoxerPred(BoxerIndexed):
         return iter((self.var, self.name, self.pos, self.sense))
 
     def _pred(self):
-        return "pred"
+        return 'pred'
 
 
 class BoxerNamed(BoxerIndexed):
@@ -1257,7 +1263,7 @@ class BoxerNamed(BoxerIndexed):
         return iter((self.var, self.name, self.type, self.sense))
 
     def _pred(self):
-        return "named"
+        return 'named'
 
 
 class BoxerRel(BoxerIndexed):
@@ -1297,7 +1303,7 @@ class BoxerRel(BoxerIndexed):
         return iter((self.var1, self.var2, self.rel, self.sense))
 
     def _pred(self):
-        return "rel"
+        return 'rel'
 
 
 class BoxerProp(BoxerIndexed):
@@ -1339,7 +1345,7 @@ class BoxerProp(BoxerIndexed):
         return iter((self.var, self.drs))
 
     def _pred(self):
-        return "prop"
+        return 'prop'
 
 
 class BoxerEq(BoxerIndexed):
@@ -1367,7 +1373,7 @@ class BoxerEq(BoxerIndexed):
         return iter((self.var1, self.var2))
 
     def _pred(self):
-        return "eq"
+        return 'eq'
 
 
 class BoxerCard(BoxerIndexed):
@@ -1394,7 +1400,7 @@ class BoxerCard(BoxerIndexed):
         return iter((self.var, self.value, self.type))
 
     def _pred(self):
-        return "card"
+        return 'card'
 
 
 class BoxerOr(BoxerIndexed):
@@ -1431,7 +1437,7 @@ class BoxerOr(BoxerIndexed):
         return iter((self.drs1, self.drs2))
 
     def _pred(self):
-        return "or"
+        return 'or'
 
 
 class BoxerWhq(BoxerIndexed):
@@ -1481,11 +1487,11 @@ class BoxerWhq(BoxerIndexed):
 
     def __iter__(self):
         return iter(
-            ("[" + ",".join(self.ans_types) + "]", self.drs1, self.variable, self.drs2)
+            ('[' + ','.join(self.ans_types) + ']', self.drs1, self.variable, self.drs2)
         )
 
     def _pred(self):
-        return "whq"
+        return 'whq'
 
 
 class PassthroughBoxerDrsInterpreter(object):
@@ -1512,13 +1518,13 @@ class NltkDrtBoxerDrsInterpreter(object):
         elif isinstance(ex, BoxerNot):
             return DrtNegatedExpression(self.interpret(ex.drs))
         elif isinstance(ex, BoxerPred):
-            pred = self._add_occur_indexing("%s_%s" % (ex.pos, ex.name), ex)
+            pred = self._add_occur_indexing('%s_%s' % (ex.pos, ex.name), ex)
             return self._make_atom(pred, ex.var)
         elif isinstance(ex, BoxerNamed):
-            pred = self._add_occur_indexing("ne_%s_%s" % (ex.type, ex.name), ex)
+            pred = self._add_occur_indexing('ne_%s_%s' % (ex.type, ex.name), ex)
             return self._make_atom(pred, ex.var)
         elif isinstance(ex, BoxerRel):
-            pred = self._add_occur_indexing("%s" % (ex.rel), ex)
+            pred = self._add_occur_indexing('%s' % (ex.rel), ex)
             return self._make_atom(pred, ex.var1, ex.var2)
         elif isinstance(ex, BoxerProp):
             return DrtProposition(Variable(ex.var), self.interpret(ex.drs))
@@ -1528,7 +1534,7 @@ class NltkDrtBoxerDrsInterpreter(object):
                 DrtVariableExpression(Variable(ex.var2)),
             )
         elif isinstance(ex, BoxerCard):
-            pred = self._add_occur_indexing("card_%s_%s" % (ex.type, ex.value), ex)
+            pred = self._add_occur_indexing('card_%s_%s' % (ex.type, ex.value), ex)
             return self._make_atom(pred, ex.var)
         elif isinstance(ex, BoxerOr):
             return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2))
@@ -1536,7 +1542,7 @@ class NltkDrtBoxerDrsInterpreter(object):
             drs1 = self.interpret(ex.drs1)
             drs2 = self.interpret(ex.drs2)
             return DRS(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
-        assert False, "%s: %s" % (ex.__class__.__name__, ex)
+        assert False, '%s: %s' % (ex.__class__.__name__, ex)
 
     def _make_atom(self, pred, *args):
         accum = DrtVariableExpression(Variable(pred))
@@ -1549,9 +1555,9 @@ class NltkDrtBoxerDrsInterpreter(object):
     def _add_occur_indexing(self, base, ex):
         if self._occur_index and ex.sent_index is not None:
             if ex.discourse_id:
-                base += "_%s" % ex.discourse_id
-            base += "_s%s" % ex.sent_index
-            base += "_w%s" % sorted(ex.word_indices)[0]
+                base += '_%s' % ex.discourse_id
+            base += '_s%s' % ex.sent_index
+            base += '_w%s' % sorted(ex.word_indices)[0]
         return base
 
 
@@ -1559,7 +1565,7 @@ class UnparseableInputException(Exception):
     pass
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     opts = OptionParser("usage: %prog TEXT [options]")
     opts.add_option(
         "--verbose",
@@ -1595,7 +1601,7 @@ if __name__ == "__main__":
 
     interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index)
     drs = Boxer(interpreter).interpret_multi(
-        args[0].split(r"\n"), question=options.question, verbose=options.verbose
+        args[0].split(r'\n'), question=options.question, verbose=options.verbose
     )
     if drs is None:
         print(None)
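A hedged usage sketch for the Boxer wrapper changed above. It assumes the external C&C parser and Boxer binaries are installed and discoverable (for example via the CANDC environment variable that _find_binary() consults); without them the call fails:

    from nltk.sem.boxer import Boxer, NltkDrtBoxerDrsInterpreter

    # Parse a sentence to a DRS, mirroring the __main__ block above.
    boxer = Boxer(NltkDrtBoxerDrsInterpreter())
    drs = boxer.interpret('John sees a dog.')
    print(drs)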
index 2597177..9500b35 100644 (file)
@@ -1,7 +1,7 @@
 # Natural Language Toolkit: Chat-80 KB Reader
 # See http://www.w3.org/TR/swbp-skos-core-guide/
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>,
 # URL: <http://nltk.sourceforge.net>
 # For license information, see LICENSE.TXT
@@ -122,127 +122,132 @@ The set of rules is written to the file ``chat_pnames.cfg`` in the
 current directory.
 
 """
+from __future__ import print_function, unicode_literals
 
 import re
 import shelve
 import os
 import sys
 
+from six import string_types
+
 import nltk.data
+from nltk.compat import python_2_unicode_compatible
 
 ###########################################################################
 # Chat-80 relation metadata bundles needed to build the valuation
 ###########################################################################
 
 borders = {
-    "rel_name": "borders",
-    "closures": ["symmetric"],
-    "schema": ["region", "border"],
-    "filename": "borders.pl",
+    'rel_name': 'borders',
+    'closures': ['symmetric'],
+    'schema': ['region', 'border'],
+    'filename': 'borders.pl',
 }
 
 contains = {
-    "rel_name": "contains0",
-    "closures": ["transitive"],
-    "schema": ["region", "contain"],
-    "filename": "contain.pl",
+    'rel_name': 'contains0',
+    'closures': ['transitive'],
+    'schema': ['region', 'contain'],
+    'filename': 'contain.pl',
 }
 
 city = {
-    "rel_name": "city",
-    "closures": [],
-    "schema": ["city", "country", "population"],
-    "filename": "cities.pl",
+    'rel_name': 'city',
+    'closures': [],
+    'schema': ['city', 'country', 'population'],
+    'filename': 'cities.pl',
 }
 
 country = {
-    "rel_name": "country",
-    "closures": [],
-    "schema": [
-        "country",
-        "region",
-        "latitude",
-        "longitude",
-        "area",
-        "population",
-        "capital",
-        "currency",
+    'rel_name': 'country',
+    'closures': [],
+    'schema': [
+        'country',
+        'region',
+        'latitude',
+        'longitude',
+        'area',
+        'population',
+        'capital',
+        'currency',
     ],
-    "filename": "countries.pl",
+    'filename': 'countries.pl',
 }
 
 circle_of_lat = {
-    "rel_name": "circle_of_latitude",
-    "closures": [],
-    "schema": ["circle_of_latitude", "degrees"],
-    "filename": "world1.pl",
+    'rel_name': 'circle_of_latitude',
+    'closures': [],
+    'schema': ['circle_of_latitude', 'degrees'],
+    'filename': 'world1.pl',
 }
 
 circle_of_long = {
-    "rel_name": "circle_of_longitude",
-    "closures": [],
-    "schema": ["circle_of_longitude", "degrees"],
-    "filename": "world1.pl",
+    'rel_name': 'circle_of_longitude',
+    'closures': [],
+    'schema': ['circle_of_longitude', 'degrees'],
+    'filename': 'world1.pl',
 }
 
 continent = {
-    "rel_name": "continent",
-    "closures": [],
-    "schema": ["continent"],
-    "filename": "world1.pl",
+    'rel_name': 'continent',
+    'closures': [],
+    'schema': ['continent'],
+    'filename': 'world1.pl',
 }
 
 region = {
-    "rel_name": "in_continent",
-    "closures": [],
-    "schema": ["region", "continent"],
-    "filename": "world1.pl",
+    'rel_name': 'in_continent',
+    'closures': [],
+    'schema': ['region', 'continent'],
+    'filename': 'world1.pl',
 }
 
 ocean = {
-    "rel_name": "ocean",
-    "closures": [],
-    "schema": ["ocean"],
-    "filename": "world1.pl",
+    'rel_name': 'ocean',
+    'closures': [],
+    'schema': ['ocean'],
+    'filename': 'world1.pl',
 }
 
-sea = {"rel_name": "sea", "closures": [], "schema": ["sea"], "filename": "world1.pl"}
+sea = {'rel_name': 'sea', 'closures': [], 'schema': ['sea'], 'filename': 'world1.pl'}
 
 
 items = [
-    "borders",
-    "contains",
-    "city",
-    "country",
-    "circle_of_lat",
-    "circle_of_long",
-    "continent",
-    "region",
-    "ocean",
-    "sea",
+    'borders',
+    'contains',
+    'city',
+    'country',
+    'circle_of_lat',
+    'circle_of_long',
+    'continent',
+    'region',
+    'ocean',
+    'sea',
 ]
 items = tuple(sorted(items))
 
 item_metadata = {
-    "borders": borders,
-    "contains": contains,
-    "city": city,
-    "country": country,
-    "circle_of_lat": circle_of_lat,
-    "circle_of_long": circle_of_long,
-    "continent": continent,
-    "region": region,
-    "ocean": ocean,
-    "sea": sea,
+    'borders': borders,
+    'contains': contains,
+    'city': city,
+    'country': country,
+    'circle_of_lat': circle_of_lat,
+    'circle_of_long': circle_of_long,
+    'continent': continent,
+    'region': region,
+    'ocean': ocean,
+    'sea': sea,
 }
 
 rels = item_metadata.values()
 
-not_unary = ["borders.pl", "contain.pl"]
+not_unary = ['borders.pl', 'contain.pl']
 
 ###########################################################################
 
 
+@python_2_unicode_compatible
 class Concept(object):
     """
     A Concept class, loosely based on SKOS
@@ -347,16 +352,17 @@ class Concept(object):
         from nltk.sem import is_rel
 
         assert is_rel(self._extension)
-        if "symmetric" in self.closures:
+        if 'symmetric' in self.closures:
             pairs = []
             for (x, y) in self._extension:
                 pairs.append((y, x))
             sym = set(pairs)
             self._extension = self._extension.union(sym)
-        if "transitive" in self.closures:
+        if 'transitive' in self.closures:
             all = self._make_graph(self._extension)
             closed = self._transclose(all)
             trans = self._make_pairs(closed)
+            # print sorted(trans)
             self._extension = self._extension.union(trans)
         self.extension = sorted(list(self._extension))
 
@@ -426,13 +432,13 @@ def cities2table(filename, rel_name, dbname, verbose=False, setup=False):
     cur = connection.cursor()
     if setup:
         cur.execute(
-            """CREATE TABLE city_table
-        (City text, Country text, Population int)"""
+            '''CREATE TABLE city_table
+        (City text, Country text, Population int)'''
         )
 
     table_name = "city_table"
     for t in records:
-        cur.execute("insert into %s values (?,?,?)" % table_name, t)
+        cur.execute('insert into %s values (?,?,?)' % table_name, t)
         if verbose:
             print("inserting values into %s: " % table_name, t)
     connection.commit()
@@ -473,9 +479,9 @@ def _str2records(filename, rel):
     contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
     for line in contents.splitlines():
         if line.startswith(rel):
-            line = re.sub(rel + r"\(", "", line)
-            line = re.sub(r"\)\.$", "", line)
-            record = line.split(",")
+            line = re.sub(rel + r'\(', '', line)
+            line = re.sub(r'\)\.$', '', line)
+            record = line.split(',')
             recs.append(record)
     return recs
 
@@ -531,8 +537,8 @@ def binary_concept(label, closures, subj, obj, records):
     :return: ``Concept`` of arity 2
     :rtype: Concept
     """
-    if not label == "border" and not label == "contain":
-        label = label + "_of"
+    if not label == 'border' and not label == 'contain':
+        label = label + '_of'
     c = Concept(label, arity=2, closures=closures, extension=set())
     for record in records:
         c.augment((record[subj], record[obj]))
@@ -553,10 +559,10 @@ def process_bundle(rels):
     """
     concepts = {}
     for rel in rels:
-        rel_name = rel["rel_name"]
-        closures = rel["closures"]
-        schema = rel["schema"]
-        filename = rel["filename"]
+        rel_name = rel['rel_name']
+        closures = rel['closures']
+        schema = rel['schema']
+        filename = rel['filename']
 
         concept_list = clause2concepts(filename, rel_name, schema, closures)
         for c in concept_list:
@@ -612,7 +618,7 @@ def val_dump(rels, db):
     """
     concepts = process_bundle(rels).values()
     valuation = make_valuation(concepts, read=True)
-    db_out = shelve.open(db, "n")
+    db_out = shelve.open(db, 'n')
 
     db_out.update(valuation)
 
@@ -674,7 +680,7 @@ def label_indivs(valuation, lexicon=False):
     pairs = [(e, e) for e in domain]
     if lexicon:
         lex = make_lex(domain)
-        with open("chat_pnames.cfg", "w") as outfile:
+        with open("chat_pnames.cfg", 'w') as outfile:
             outfile.writelines(lex)
     # read the pairs into the valuation
     valuation.update(pairs)
@@ -703,9 +709,9 @@ def make_lex(symbols):
     template = "PropN[num=sg, sem=<\P.(P %s)>] -> '%s'\n"
 
     for s in symbols:
-        parts = s.split("_")
+        parts = s.split('_')
         caps = [p.capitalize() for p in parts]
-        pname = "_".join(caps)
+        pname = '_'.join(caps)
         rule = template % (s, pname)
         lex.append(rule)
     return lex
@@ -725,7 +731,7 @@ def concepts(items=items):
     :return: the ``Concept`` objects which are extracted from the relations
     :rtype: list(Concept)
     """
-    if isinstance(items, str):
+    if isinstance(items, string_types):
         items = (items,)
 
     rels = [item_metadata[r] for r in items]
@@ -848,10 +854,10 @@ def sql_demo():
     """
     print()
     print("Using SQL to extract rows from 'city.db' RDB.")
-    for row in sql_query("corpora/city_database/city.db", "SELECT * FROM city_table"):
+    for row in sql_query('corpora/city_database/city.db', "SELECT * FROM city_table"):
         print(row)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     main()
     sql_demo()
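
(The hunks above are typical of the whole commit: Python-3-only constructs are swapped back to the six-based Python 2/3 idiom used throughout NLTK 3.4. A minimal, self-contained sketch of that idiom follows, using the same helpers imported in the diff, six.string_types and nltk.compat.python_2_unicode_compatible (present in NLTK 3.4, removed in later releases); the Greeting class is hypothetical and only for illustration.)

from __future__ import print_function, unicode_literals

from six import string_types                         # (str, unicode) on Python 2, (str,) on Python 3
from nltk.compat import python_2_unicode_compatible  # no-op on Python 3; wires __str__/__unicode__ on Python 2


@python_2_unicode_compatible
class Greeting(object):
    """Hypothetical class, for illustration only."""

    def __init__(self, name):
        # accepts both byte strings and unicode on Python 2, plain str on Python 3
        assert isinstance(name, string_types)
        self.name = name

    def __str__(self):
        # written once as a text string; the decorator makes it behave on Python 2
        return 'hello, %s' % self.name


print(Greeting('world'))
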
diff --git a/nlp_resource_data/nltk/sem/cooper_storage.py b/nlp_resource_data/nltk/sem/cooper_storage.py
index 830c3e4..4aca110 100644 (file)
@@ -1,9 +1,10 @@
 # Natural Language Toolkit: Cooper storage for Quantifier Ambiguity
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function
 
 from nltk.sem.logic import LambdaExpression, ApplicationExpression, Variable
 from nltk.parse import load_parser
@@ -25,8 +26,8 @@ class CooperStore(object):
         self.featstruct = featstruct
         self.readings = []
         try:
-            self.core = featstruct["CORE"]
-            self.store = featstruct["STORE"]
+            self.core = featstruct['CORE']
+            self.store = featstruct['STORE']
         except KeyError:
             print("%s is not a Cooper storage structure" % featstruct)
 
@@ -82,7 +83,7 @@ def parse_with_bindops(sentence, grammar=None, trace=0):
     Use a grammar with Binding Operators to parse a sentence.
     """
     if not grammar:
-        grammar = "grammars/book_grammars/storage.fcfg"
+        grammar = 'grammars/book_grammars/storage.fcfg'
     parser = load_parser(grammar, trace=trace, chart_class=InstantiateVarsChart)
     # Parse the sentence.
     tokens = sentence.split()
@@ -99,7 +100,7 @@ def demo():
     print("=" * 50)
     trees = cs.parse_with_bindops(sentence, trace=0)
     for tree in trees:
-        semrep = cs.CooperStore(tree.label()["SEM"])
+        semrep = cs.CooperStore(tree.label()['SEM'])
         print()
         print("Binding operators:")
         print("-" * 15)
@@ -120,5 +121,5 @@ def demo():
             print("%s: %s" % (i + 1, reading))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
diff --git a/nlp_resource_data/nltk/sem/drt.py b/nlp_resource_data/nltk/sem/drt.py
index 57e26fb..8bc67f6 100644 (file)
@@ -2,14 +2,18 @@
 #
 # Author: Dan Garrette <dhgarrette@gmail.com>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
 
 import operator
 from functools import reduce
 from itertools import chain
 
+from six import string_types
+
+from nltk.compat import python_2_unicode_compatible
 from nltk.sem.logic import (
     APP,
     AbstractVariableExpression,
@@ -40,8 +44,8 @@ from nltk.sem.logic import (
 
 # Import Tkinter-based modules if they are available
 try:
-    from tkinter import Canvas, Tk
-    from tkinter.font import Font
+    from six.moves.tkinter import Canvas, Tk
+    from six.moves.tkinter_font import Font
     from nltk.util import in_idle
 
 except ImportError:
@@ -50,12 +54,12 @@ except ImportError:
 
 
 class DrtTokens(Tokens):
-    DRS = "DRS"
-    DRS_CONC = "+"
-    PRONOUN = "PRO"
-    OPEN_BRACKET = "["
-    CLOSE_BRACKET = "]"
-    COLON = ":"
+    DRS = 'DRS'
+    DRS_CONC = '+'
+    PRONOUN = 'PRO'
+    OPEN_BRACKET = '['
+    CLOSE_BRACKET = ']'
+    COLON = ':'
 
     PUNCT = [DRS_CONC, OPEN_BRACKET, CLOSE_BRACKET, COLON]
 
@@ -135,7 +139,7 @@ class DrtParser(LogicParser):
             # Support expressions like: DRS([x y],C) == DRS([x,y],C)
             if refs and self.token(0) == DrtTokens.COMMA:
                 self.token()  # swallow the comma
-            refs.append(self.get_next_token_variable("quantified"))
+            refs.append(self.get_next_token_variable('quantified'))
         self.assertNextToken(DrtTokens.CLOSE_BRACKET)
         return refs
 
@@ -152,7 +156,7 @@ class DrtParser(LogicParser):
 
     def handle_prop(self, tok, context):
         variable = self.make_VariableExpression(tok)
-        self.assertNextToken(":")
+        self.assertNextToken(':')
         drs = self.process_next_expression(DrtTokens.COLON)
         return DrtProposition(variable, drs)
 
@@ -175,7 +179,7 @@ class DrtParser(LogicParser):
                     return DRS(first.refs, first.conds, second)
                 if isinstance(first, DrtConcatenation):
                     return DrtConcatenation(first.first, first.second, second)
-                raise Exception("Antecedent of implication must be a DRS")
+                raise Exception('Antecedent of implication must be a DRS')
 
             return make_imp_expression
         else:
@@ -225,7 +229,7 @@ class DrtExpression(object):
             return DRS(self.refs, self.conds, other)
         if isinstance(self, DrtConcatenation):
             return DrtConcatenation(self.first, self.second, other)
-        raise Exception("Antecedent of implication must be a DRS")
+        raise Exception('Antecedent of implication must be a DRS')
 
     def equiv(self, other, prover=None):
         """
@@ -288,7 +292,7 @@ class DrtExpression(object):
         Draw the DRS
         :return: the pretty print string
         """
-        return "\n".join(self._pretty())
+        return '\n'.join(self._pretty())
 
     def pretty_print(self):
         print(self.pretty_format())
@@ -297,6 +301,7 @@ class DrtExpression(object):
         DrsDrawer(self).draw()
 
 
+@python_2_unicode_compatible
 class DRS(DrtExpression, Expression):
     """A Discourse Representation Structure."""
 
@@ -464,7 +469,7 @@ class DRS(DrtExpression, Expression):
             return accum
 
     def _pretty(self):
-        refs_line = " ".join(self._order_ref_strings(self.refs))
+        refs_line = ' '.join(self._order_ref_strings(self.refs))
 
         cond_lines = [
             cond
@@ -476,12 +481,12 @@ class DRS(DrtExpression, Expression):
         length = max([len(refs_line)] + list(map(len, cond_lines)))
         drs = (
             [
-                " _" + "_" * length + "_ ",
-                "| " + refs_line.ljust(length) + " |",
-                "|-" + "-" * length + "-|",
+                ' _' + '_' * length + '_ ',
+                '| ' + refs_line.ljust(length) + ' |',
+                '|-' + '-' * length + '-|',
             ]
-            + ["| " + line.ljust(length) + " |" for line in cond_lines]
-            + ["|_" + "_" * length + "_|"]
+            + ['| ' + line.ljust(length) + ' |' for line in cond_lines]
+            + ['|_' + '_' * length + '_|']
         )
         if self.consequent:
             return DrtBinaryExpression._assemble_pretty(
@@ -535,17 +540,17 @@ class DRS(DrtExpression, Expression):
     __hash__ = Expression.__hash__
 
     def __str__(self):
-        drs = "([%s],[%s])" % (
-            ",".join(self._order_ref_strings(self.refs)),
-            ", ".join("%s" % cond for cond in self.conds),
+        drs = '([%s],[%s])' % (
+            ','.join(self._order_ref_strings(self.refs)),
+            ', '.join("%s" % cond for cond in self.conds),
         )  # map(str, self.conds)))
         if self.consequent:
             return (
                 DrtTokens.OPEN
                 + drs
-                + " "
+                + ' '
                 + DrtTokens.IMP
-                + " "
+                + ' '
                 + "%s" % self.consequent
                 + DrtTokens.CLOSE
             )
@@ -577,7 +582,7 @@ class DrtAbstractVariableExpression(DrtExpression, AbstractVariableExpression):
 
     def _pretty(self):
         s = "%s" % self
-        blank = " " * len(s)
+        blank = ' ' * len(s)
         return [blank, blank, s, blank]
 
     def eliminate_equality(self):
@@ -606,6 +611,7 @@ class DrtConstantExpression(DrtAbstractVariableExpression, ConstantExpression):
     pass
 
 
+@python_2_unicode_compatible
 class DrtProposition(DrtExpression, Expression):
     def __init__(self, variable, drs):
         self.variable = variable
@@ -649,11 +655,11 @@ class DrtProposition(DrtExpression, Expression):
 
     def _pretty(self):
         drs_s = self.drs._pretty()
-        blank = " " * len("%s" % self.variable)
+        blank = ' ' * len("%s" % self.variable)
         return (
-            [blank + " " + line for line in drs_s[:1]]
-            + ["%s" % self.variable + ":" + line for line in drs_s[1:2]]
-            + [blank + " " + line for line in drs_s[2:]]
+            [blank + ' ' + line for line in drs_s[:1]]
+            + ["%s" % self.variable + ':' + line for line in drs_s[1:2]]
+            + [blank + ' ' + line for line in drs_s[2:]]
         )
 
     def visit(self, function, combinator):
@@ -665,7 +671,7 @@ class DrtProposition(DrtExpression, Expression):
         return combinator(self.variable, function(self.drs))
 
     def __str__(self):
-        return "prop(%s, %s)" % (self.variable, self.drs)
+        return 'prop(%s, %s)' % (self.variable, self.drs)
 
 
 class DrtNegatedExpression(DrtExpression, NegatedExpression):
@@ -679,10 +685,10 @@ class DrtNegatedExpression(DrtExpression, NegatedExpression):
     def _pretty(self):
         term_lines = self.term._pretty()
         return (
-            ["    " + line for line in term_lines[:2]]
-            + ["__  " + line for line in term_lines[2:3]]
-            + ["  | " + line for line in term_lines[3:4]]
-            + ["    " + line for line in term_lines[4:]]
+            ['    ' + line for line in term_lines[:2]]
+            + ['__  ' + line for line in term_lines[2:3]]
+            + ['  | ' + line for line in term_lines[3:4]]
+            + ['    ' + line for line in term_lines[4:]]
         )
 
 
@@ -706,14 +712,14 @@ class DrtLambdaExpression(DrtExpression, LambdaExpression):
         while term.__class__ == self.__class__:
             variables.append(term.variable)
             term = term.term
-        var_string = " ".join("%s" % v for v in variables) + DrtTokens.DOT
+        var_string = ' '.join("%s" % v for v in variables) + DrtTokens.DOT
         term_lines = term._pretty()
-        blank = " " * len(var_string)
+        blank = ' ' * len(var_string)
         return (
-            ["    " + blank + line for line in term_lines[:1]]
-            + [" \  " + blank + line for line in term_lines[1:2]]
-            + [" /\ " + var_string + line for line in term_lines[2:3]]
-            + ["    " + blank + line for line in term_lines[3:]]
+            ['    ' + blank + line for line in term_lines[:1]]
+            + [' \  ' + blank + line for line in term_lines[1:2]]
+            + [' /\ ' + var_string + line for line in term_lines[2:3]]
+            + ['    ' + blank + line for line in term_lines[3:]]
         )
 
 
@@ -736,19 +742,19 @@ class DrtBinaryExpression(DrtExpression, BinaryExpression):
         max_lines = max(len(first_lines), len(second_lines))
         first_lines = _pad_vertically(first_lines, max_lines)
         second_lines = _pad_vertically(second_lines, max_lines)
-        blank = " " * len(op)
+        blank = ' ' * len(op)
         first_second_lines = list(zip(first_lines, second_lines))
         return (
             [
-                " " + first_line + " " + blank + " " + second_line + " "
+                ' ' + first_line + ' ' + blank + ' ' + second_line + ' '
                 for first_line, second_line in first_second_lines[:2]
             ]
             + [
-                "(" + first_line + " " + op + " " + second_line + ")"
+                '(' + first_line + ' ' + op + ' ' + second_line + ')'
                 for first_line, second_line in first_second_lines[2:3]
             ]
             + [
-                " " + first_line + " " + blank + " " + second_line + " "
+                ' ' + first_line + ' ' + blank + ' ' + second_line + ' '
                 for first_line, second_line in first_second_lines[3:]
             ]
         )
@@ -776,6 +782,7 @@ class DrtEqualityExpression(DrtBinaryExpression, EqualityExpression):
         return EqualityExpression(self.first.fol(), self.second.fol())
 
 
+@python_2_unicode_compatible
 class DrtConcatenation(DrtBooleanExpression):
     """DRS of the form '(DRS + DRS)'"""
 
@@ -912,14 +919,14 @@ class DrtConcatenation(DrtBooleanExpression):
     def __str__(self):
         first = self._str_subex(self.first)
         second = self._str_subex(self.second)
-        drs = Tokens.OPEN + first + " " + self.getOp() + " " + second + Tokens.CLOSE
+        drs = Tokens.OPEN + first + ' ' + self.getOp() + ' ' + second + Tokens.CLOSE
         if self.consequent:
             return (
                 DrtTokens.OPEN
                 + drs
-                + " "
+                + ' '
                 + DrtTokens.IMP
-                + " "
+                + ' '
                 + "%s" % self.consequent
                 + DrtTokens.CLOSE
             )
@@ -954,25 +961,26 @@ class DrtApplicationExpression(DrtExpression, ApplicationExpression):
         func_args_lines = list(zip(function_lines, list(zip(*args_lines))))
         return (
             [
-                func_line + " " + " ".join(args_line) + " "
+                func_line + ' ' + ' '.join(args_line) + ' '
                 for func_line, args_line in func_args_lines[:2]
             ]
             + [
-                func_line + "(" + ",".join(args_line) + ")"
+                func_line + '(' + ','.join(args_line) + ')'
                 for func_line, args_line in func_args_lines[2:3]
             ]
             + [
-                func_line + " " + " ".join(args_line) + " "
+                func_line + ' ' + ' '.join(args_line) + ' '
                 for func_line, args_line in func_args_lines[3:]
             ]
         )
 
 
 def _pad_vertically(lines, max_lines):
-    pad_line = [" " * len(lines[0])]
+    pad_line = [' ' * len(lines[0])]
     return lines + pad_line * (max_lines - len(lines))
 
 
+@python_2_unicode_compatible
 class PossibleAntecedents(list, DrtExpression, Expression):
     def free(self):
         """Set of free variables."""
@@ -991,11 +999,11 @@ class PossibleAntecedents(list, DrtExpression, Expression):
 
     def _pretty(self):
         s = "%s" % self
-        blank = " " * len(s)
+        blank = ' ' * len(s)
         return [blank, blank, s]
 
     def __str__(self):
-        return "[" + ",".join("%s" % it for it in self) + "]"
+        return '[' + ','.join("%s" % it for it in self) + ']'
 
 
 class AnaphoraResolutionException(Exception):
@@ -1101,7 +1109,7 @@ class DrsDrawer(object):
             master = Tk()
             master.title("DRT")
 
-            font = Font(family="helvetica", size=12)
+            font = Font(family='helvetica', size=12)
 
             if size_canvas:
                 canvas = Canvas(master, width=0, height=0)
@@ -1155,8 +1163,8 @@ class DrsDrawer(object):
         :param y: the left side of the current drawing area
         :return: the bottom-rightmost point
         """
-        if isinstance(item, str):
-            self.canvas.create_text(x, y, anchor="nw", font=self.canvas.font, text=item)
+        if isinstance(item, string_types):
+            self.canvas.create_text(x, y, anchor='nw', font=self.canvas.font, text=item)
         elif isinstance(item, tuple):
             # item is the lower-right of a box
             (right, bottom) = item
@@ -1177,7 +1185,7 @@ class DrsDrawer(object):
         :param y: the left side of the current drawing area
         :return: the bottom-rightmost point
         """
-        if isinstance(item, str):
+        if isinstance(item, string_types):
             return (x + self.canvas.font.measure(item), y + self._get_text_height())
         elif isinstance(item, tuple):
             return item
@@ -1253,9 +1261,9 @@ class DrsDrawer(object):
 
         # Handle Discourse Referents
         if expression.refs:
-            refs = " ".join("%s" % r for r in expression.refs)
+            refs = ' '.join("%s" % r for r in expression.refs)
         else:
-            refs = "     "
+            refs = '     '
         (max_right, bottom) = command(refs, left, bottom)
         bottom += self.BUFFER * 2
 
@@ -1308,7 +1316,7 @@ class DrsDrawer(object):
 
             if i + 1 < len(args):
                 # since it's not the last arg, add a comma
-                right = command(DrtTokens.COMMA + " ", right, centred_string_top)[0]
+                right = command(DrtTokens.COMMA + ' ', right, centred_string_top)[0]
 
         # Handle close paren
         right = command(DrtTokens.CLOSE, right, centred_string_top)[0]
@@ -1352,7 +1360,7 @@ class DrsDrawer(object):
         )
 
         # Handle the operator
-        right = command(" %s " % expression.getOp(), right, centred_string_top)[0]
+        right = command(' %s ' % expression.getOp(), right, centred_string_top)[0]
 
         # Handle the second operand
         second_height = expression.second._drawing_height
@@ -1385,36 +1393,36 @@ class DrsDrawer(object):
 
 
 def demo():
-    print("=" * 20 + "TEST PARSE" + "=" * 20)
+    print('=' * 20 + 'TEST PARSE' + '=' * 20)
     dexpr = DrtExpression.fromstring
-    print(dexpr(r"([x,y],[sees(x,y)])"))
-    print(dexpr(r"([x],[man(x), walks(x)])"))
-    print(dexpr(r"\x.\y.([],[sees(x,y)])"))
-    print(dexpr(r"\x.([],[walks(x)])(john)"))
-    print(dexpr(r"(([x],[walks(x)]) + ([y],[runs(y)]))"))
-    print(dexpr(r"(([],[walks(x)]) -> ([],[runs(x)]))"))
-    print(dexpr(r"([x],[PRO(x), sees(John,x)])"))
-    print(dexpr(r"([x],[man(x), -([],[walks(x)])])"))
-    print(dexpr(r"([],[(([x],[man(x)]) -> ([],[walks(x)]))])"))
-
-    print("=" * 20 + "Test fol()" + "=" * 20)
-    print(dexpr(r"([x,y],[sees(x,y)])").fol())
-
-    print("=" * 20 + "Test alpha conversion and lambda expression equality" + "=" * 20)
-    e1 = dexpr(r"\x.([],[P(x)])")
+    print(dexpr(r'([x,y],[sees(x,y)])'))
+    print(dexpr(r'([x],[man(x), walks(x)])'))
+    print(dexpr(r'\x.\y.([],[sees(x,y)])'))
+    print(dexpr(r'\x.([],[walks(x)])(john)'))
+    print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))'))
+    print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))'))
+    print(dexpr(r'([x],[PRO(x), sees(John,x)])'))
+    print(dexpr(r'([x],[man(x), -([],[walks(x)])])'))
+    print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])'))
+
+    print('=' * 20 + 'Test fol()' + '=' * 20)
+    print(dexpr(r'([x,y],[sees(x,y)])').fol())
+
+    print('=' * 20 + 'Test alpha conversion and lambda expression equality' + '=' * 20)
+    e1 = dexpr(r'\x.([],[P(x)])')
     print(e1)
-    e2 = e1.alpha_convert(Variable("z"))
+    e2 = e1.alpha_convert(Variable('z'))
     print(e2)
     print(e1 == e2)
 
-    print("=" * 20 + "Test resolve_anaphora()" + "=" * 20)
-    print(resolve_anaphora(dexpr(r"([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])")))
+    print('=' * 20 + 'Test resolve_anaphora()' + '=' * 20)
+    print(resolve_anaphora(dexpr(r'([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])')))
     print(
-        resolve_anaphora(dexpr(r"([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])"))
+        resolve_anaphora(dexpr(r'([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])'))
     )
-    print(resolve_anaphora(dexpr(r"(([x,y],[]) + ([],[PRO(x)]))")))
+    print(resolve_anaphora(dexpr(r'(([x,y],[]) + ([],[PRO(x)]))')))
 
-    print("=" * 20 + "Test pretty_print()" + "=" * 20)
+    print('=' * 20 + 'Test pretty_print()' + '=' * 20)
     dexpr(r"([],[])").pretty_print()
     dexpr(
         r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])"
@@ -1426,24 +1434,24 @@ def demo():
 
 def test_draw():
     try:
-        from tkinter import Tk
+        from six.moves.tkinter import Tk
     except ImportError:
         from nose import SkipTest
 
         raise SkipTest("tkinter is required, but it's not available.")
 
     expressions = [
-        r"x",
-        r"([],[])",
-        r"([x],[])",
-        r"([x],[man(x)])",
-        r"([x,y],[sees(x,y)])",
-        r"([x],[man(x), walks(x)])",
-        r"\x.([],[man(x), walks(x)])",
-        r"\x y.([],[sees(x,y)])",
-        r"([],[(([],[walks(x)]) + ([],[runs(x)]))])",
-        r"([x],[man(x), -([],[walks(x)])])",
-        r"([],[(([x],[man(x)]) -> ([],[walks(x)]))])",
+        r'x',
+        r'([],[])',
+        r'([x],[])',
+        r'([x],[man(x)])',
+        r'([x,y],[sees(x,y)])',
+        r'([x],[man(x), walks(x)])',
+        r'\x.([],[man(x), walks(x)])',
+        r'\x y.([],[sees(x,y)])',
+        r'([],[(([],[walks(x)]) + ([],[runs(x)]))])',
+        r'([x],[man(x), -([],[walks(x)])])',
+        r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])',
     ]
 
     for e in expressions:
@@ -1451,5 +1459,5 @@ def test_draw():
         d.draw()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
diff --git a/nlp_resource_data/nltk/sem/drt_glue_demo.py b/nlp_resource_data/nltk/sem/drt_glue_demo.py
index 61a4f5b..4a45325 100644 (file)
@@ -3,12 +3,12 @@
 #
 # Author: Dan Garrette <dhgarrette@gmail.com>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 try:
-    from tkinter import (
+    from six.moves.tkinter import (
         Button,
         Frame,
         IntVar,
@@ -18,7 +18,7 @@ try:
         Scrollbar,
         Tk,
     )
-    from tkinter.font import Font
+    from six.moves.tkinter_font import Font
     from nltk.draw.util import CanvasFrame, ShowText
 
 except ImportError:
@@ -36,7 +36,7 @@ class DrtGlueDemo(object):
     def __init__(self, examples):
         # Set up the main window.
         self._top = Tk()
-        self._top.title("DRT Glue Demo")
+        self._top.title('DRT Glue Demo')
 
         # Set up key bindings.
         self._init_bindings()
@@ -68,7 +68,7 @@ class DrtGlueDemo(object):
         self._init_canvas(self._top)
 
         # Resize callback
-        self._canvas.bind("<Configure>", self._configure)
+        self._canvas.bind('<Configure>', self._configure)
 
     #########################################
     ##  Initialization Helpers
@@ -77,17 +77,17 @@ class DrtGlueDemo(object):
     def _init_glue(self):
         tagger = RegexpTagger(
             [
-                ("^(David|Mary|John)$", "NNP"),
+                ('^(David|Mary|John)$', 'NNP'),
                 (
-                    "^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
-                    "VB",
+                    '^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$',
+                    'VB',
                 ),
-                ("^(go|order|vanish|find|approach)$", "VB"),
-                ("^(a)$", "ex_quant"),
-                ("^(every)$", "univ_quant"),
-                ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
-                ("^(big|gray|former)$", "JJ"),
-                ("^(him|himself)$", "PRP"),
+                ('^(go|order|vanish|find|approach)$', 'VB'),
+                ('^(a)$', 'ex_quant'),
+                ('^(every)$', 'univ_quant'),
+                ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
+                ('^(big|gray|former)$', 'JJ'),
+                ('^(him|himself)$', 'PRP'),
             ]
         )
 
@@ -101,134 +101,134 @@ class DrtGlueDemo(object):
 
         # TWhat's our font size (default=same as sysfont)
         self._size = IntVar(root)
-        self._size.set(self._sysfont.cget("size"))
+        self._size.set(self._sysfont.cget('size'))
 
-        self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
-        self._font = Font(family="helvetica", size=self._size.get())
+        self._boldfont = Font(family='helvetica', weight='bold', size=self._size.get())
+        self._font = Font(family='helvetica', size=self._size.get())
         if self._size.get() < 0:
             big = self._size.get() - 2
         else:
             big = self._size.get() + 2
-        self._bigfont = Font(family="helvetica", weight="bold", size=big)
+        self._bigfont = Font(family='helvetica', weight='bold', size=big)
 
     def _init_exampleListbox(self, parent):
         self._exampleFrame = listframe = Frame(parent)
-        self._exampleFrame.pack(fill="both", side="left", padx=2)
+        self._exampleFrame.pack(fill='both', side='left', padx=2)
         self._exampleList_label = Label(
-            self._exampleFrame, font=self._boldfont, text="Examples"
+            self._exampleFrame, font=self._boldfont, text='Examples'
         )
         self._exampleList_label.pack()
         self._exampleList = Listbox(
             self._exampleFrame,
-            selectmode="single",
-            relief="groove",
-            background="white",
-            foreground="#909090",
+            selectmode='single',
+            relief='groove',
+            background='white',
+            foreground='#909090',
             font=self._font,
-            selectforeground="#004040",
-            selectbackground="#c0f0c0",
+            selectforeground='#004040',
+            selectbackground='#c0f0c0',
         )
 
-        self._exampleList.pack(side="right", fill="both", expand=1)
+        self._exampleList.pack(side='right', fill='both', expand=1)
 
         for example in self._examples:
-            self._exampleList.insert("end", ("  %s" % example))
+            self._exampleList.insert('end', ('  %s' % example))
         self._exampleList.config(height=min(len(self._examples), 25), width=40)
 
         # Add a scrollbar if there are more than 25 examples.
         if len(self._examples) > 25:
-            listscroll = Scrollbar(self._exampleFrame, orient="vertical")
+            listscroll = Scrollbar(self._exampleFrame, orient='vertical')
             self._exampleList.config(yscrollcommand=listscroll.set)
             listscroll.config(command=self._exampleList.yview)
-            listscroll.pack(side="left", fill="y")
+            listscroll.pack(side='left', fill='y')
 
         # If they select a example, apply it.
-        self._exampleList.bind("<<ListboxSelect>>", self._exampleList_select)
+        self._exampleList.bind('<<ListboxSelect>>', self._exampleList_select)
 
     def _init_readingListbox(self, parent):
         self._readingFrame = listframe = Frame(parent)
-        self._readingFrame.pack(fill="both", side="left", padx=2)
+        self._readingFrame.pack(fill='both', side='left', padx=2)
         self._readingList_label = Label(
-            self._readingFrame, font=self._boldfont, text="Readings"
+            self._readingFrame, font=self._boldfont, text='Readings'
         )
         self._readingList_label.pack()
         self._readingList = Listbox(
             self._readingFrame,
-            selectmode="single",
-            relief="groove",
-            background="white",
-            foreground="#909090",
+            selectmode='single',
+            relief='groove',
+            background='white',
+            foreground='#909090',
             font=self._font,
-            selectforeground="#004040",
-            selectbackground="#c0f0c0",
+            selectforeground='#004040',
+            selectbackground='#c0f0c0',
         )
 
-        self._readingList.pack(side="right", fill="both", expand=1)
+        self._readingList.pack(side='right', fill='both', expand=1)
 
         # Add a scrollbar if there are more than 25 examples.
-        listscroll = Scrollbar(self._readingFrame, orient="vertical")
+        listscroll = Scrollbar(self._readingFrame, orient='vertical')
         self._readingList.config(yscrollcommand=listscroll.set)
         listscroll.config(command=self._readingList.yview)
-        listscroll.pack(side="right", fill="y")
+        listscroll.pack(side='right', fill='y')
 
         self._populate_readingListbox()
 
     def _populate_readingListbox(self):
         # Populate the listbox with integers
-        self._readingList.delete(0, "end")
+        self._readingList.delete(0, 'end')
         for i in range(len(self._readings)):
-            self._readingList.insert("end", ("  %s" % (i + 1)))
+            self._readingList.insert('end', ('  %s' % (i + 1)))
         self._readingList.config(height=min(len(self._readings), 25), width=5)
 
         # If they select a example, apply it.
-        self._readingList.bind("<<ListboxSelect>>", self._readingList_select)
+        self._readingList.bind('<<ListboxSelect>>', self._readingList_select)
 
     def _init_bindings(self):
         # Key bindings are a good thing.
-        self._top.bind("<Control-q>", self.destroy)
-        self._top.bind("<Control-x>", self.destroy)
-        self._top.bind("<Escape>", self.destroy)
-        self._top.bind("n", self.next)
-        self._top.bind("<space>", self.next)
-        self._top.bind("p", self.prev)
-        self._top.bind("<BackSpace>", self.prev)
+        self._top.bind('<Control-q>', self.destroy)
+        self._top.bind('<Control-x>', self.destroy)
+        self._top.bind('<Escape>', self.destroy)
+        self._top.bind('n', self.next)
+        self._top.bind('<space>', self.next)
+        self._top.bind('p', self.prev)
+        self._top.bind('<BackSpace>', self.prev)
 
     def _init_buttons(self, parent):
         # Set up the frames.
         self._buttonframe = buttonframe = Frame(parent)
-        buttonframe.pack(fill="none", side="bottom", padx=3, pady=2)
+        buttonframe.pack(fill='none', side='bottom', padx=3, pady=2)
         Button(
             buttonframe,
-            text="Prev",
-            background="#90c0d0",
-            foreground="black",
+            text='Prev',
+            background='#90c0d0',
+            foreground='black',
             command=self.prev,
-        ).pack(side="left")
+        ).pack(side='left')
         Button(
             buttonframe,
-            text="Next",
-            background="#90c0d0",
-            foreground="black",
+            text='Next',
+            background='#90c0d0',
+            foreground='black',
             command=self.next,
-        ).pack(side="left")
+        ).pack(side='left')
 
     def _configure(self, event):
         self._autostep = 0
         (x1, y1, x2, y2) = self._cframe.scrollregion()
         y2 = event.height - 6
-        self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2)
+        self._canvas['scrollregion'] = '%d %d %d %d' % (x1, y1, x2, y2)
         self._redraw()
 
     def _init_canvas(self, parent):
         self._cframe = CanvasFrame(
             parent,
-            background="white",
+            background='white',
             # width=525, height=250,
             closeenough=10,
             border=2,
-            relief="sunken",
+            relief='sunken',
         )
-        self._cframe.pack(expand=1, fill="both", side="top", pady=2)
+        self._cframe.pack(expand=1, fill='both', side='top', pady=2)
         canvas = self._canvas = self._cframe.canvas()
 
         # Initially, there's no tree or text
@@ -241,70 +241,70 @@ class DrtGlueDemo(object):
 
         filemenu = Menu(menubar, tearoff=0)
         filemenu.add_command(
-            label="Exit", underline=1, command=self.destroy, accelerator="q"
+            label='Exit', underline=1, command=self.destroy, accelerator='q'
         )
-        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+        menubar.add_cascade(label='File', underline=0, menu=filemenu)
 
         actionmenu = Menu(menubar, tearoff=0)
         actionmenu.add_command(
-            label="Next", underline=0, command=self.next, accelerator="n, Space"
+            label='Next', underline=0, command=self.next, accelerator='n, Space'
         )
         actionmenu.add_command(
-            label="Previous", underline=0, command=self.prev, accelerator="p, Backspace"
+            label='Previous', underline=0, command=self.prev, accelerator='p, Backspace'
         )
-        menubar.add_cascade(label="Action", underline=0, menu=actionmenu)
+        menubar.add_cascade(label='Action', underline=0, menu=actionmenu)
 
         optionmenu = Menu(menubar, tearoff=0)
         optionmenu.add_checkbutton(
-            label="Remove Duplicates",
+            label='Remove Duplicates',
             underline=0,
             variable=self._glue.remove_duplicates,
             command=self._toggle_remove_duplicates,
-            accelerator="r",
+            accelerator='r',
         )
-        menubar.add_cascade(label="Options", underline=0, menu=optionmenu)
+        menubar.add_cascade(label='Options', underline=0, menu=optionmenu)
 
         viewmenu = Menu(menubar, tearoff=0)
         viewmenu.add_radiobutton(
-            label="Tiny",
+            label='Tiny',
             variable=self._size,
             underline=0,
             value=10,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Small",
+            label='Small',
             variable=self._size,
             underline=0,
             value=12,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Medium",
+            label='Medium',
             variable=self._size,
             underline=0,
             value=14,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Large",
+            label='Large',
             variable=self._size,
             underline=0,
             value=18,
             command=self.resize,
         )
         viewmenu.add_radiobutton(
-            label="Huge",
+            label='Huge',
             variable=self._size,
             underline=0,
             value=24,
             command=self.resize,
         )
-        menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+        menubar.add_cascade(label='View', underline=0, menu=viewmenu)
 
         helpmenu = Menu(menubar, tearoff=0)
-        helpmenu.add_command(label="About", underline=0, command=self.about)
-        menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+        helpmenu.add_command(label='About', underline=0, command=self.about)
+        menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
 
         parent.config(menu=menubar)
 
@@ -405,9 +405,9 @@ class DrtGlueDemo(object):
             "NLTK Discourse Representation Theory (DRT) Glue Semantics Demo\n"
             + "Written by Daniel H. Garrette"
         )
-        TITLE = "About: NLTK DRT Glue Demo"
+        TITLE = 'About: NLTK DRT Glue Demo'
         try:
-            from tkinter.messagebox import Message
+            from six.moves.tkinter_messagebox import Message
 
             Message(message=ABOUT, title=TITLE).show()
         except:
@@ -441,7 +441,7 @@ class DrtGlueDemo(object):
     def _toggle_remove_duplicates(self):
         self._glue.remove_duplicates = not self._glue.remove_duplicates
 
-        self._exampleList.selection_clear(0, "end")
+        self._exampleList.selection_clear(0, 'end')
         self._readings = []
         self._populate_readingListbox()
         self._readingCache = [None for ex in self._examples]
@@ -461,7 +461,7 @@ class DrtGlueDemo(object):
         self._curExample = index
         example = self._examples[index]
 
-        self._exampleList.selection_clear(0, "end")
+        self._exampleList.selection_clear(0, 'end')
         if example:
             cache = self._readingCache[index]
             if cache:
@@ -478,12 +478,12 @@ class DrtGlueDemo(object):
                     self._readingCache[index] = self._readings
                 except Exception as e:
                     self._readings = []
-                    self._error = DrtVariableExpression(Variable("Error: " + str(e)))
+                    self._error = DrtVariableExpression(Variable('Error: ' + str(e)))
                     self._readingCache[index] = self._error
 
                     # add a star to the end of the example
                     self._exampleList.delete(index)
-                    self._exampleList.insert(index, ("  %s *" % example))
+                    self._exampleList.insert(index, ('  %s *' % example))
                     self._exampleList.config(
                         height=min(len(self._examples), 25), width=40
                     )
@@ -504,7 +504,7 @@ class DrtGlueDemo(object):
     def _readingList_store_selection(self, index):
         reading = self._readings[index]
 
-        self._readingList.selection_clear(0, "end")
+        self._readingList.selection_clear(0, 'end')
         if reading:
             self._readingList.selection_set(index)
 
@@ -518,7 +518,7 @@ class DrsWidget(object):
         self._drs = drs
         self._canvas = canvas
         canvas.font = Font(
-            font=canvas.itemcget(canvas.create_text(0, 0, text=""), "font")
+            font=canvas.itemcget(canvas.create_text(0, 0, text=''), 'font')
         )
         canvas._BUFFER = 3
         self.bbox = (0, 0, 0, 0)
@@ -533,13 +533,13 @@ class DrsWidget(object):
 
 def demo():
     examples = [
-        "John walks",
-        "David sees Mary",
-        "David eats a sandwich",
-        "every man chases a dog",
+        'John walks',
+        'David sees Mary',
+        'David eats a sandwich',
+        'every man chases a dog',
         #                'every man believes a dog yawns',
         #                'John gives David a sandwich',
-        "John chases himself",
+        'John chases himself',
         #                'John persuades David to order a pizza',
         #                'John tries to go',
         #                'John tries to find a unicorn',
@@ -557,5 +557,5 @@ def demo():
     DrtGlueDemo(examples).mainloop()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
diff --git a/nlp_resource_data/nltk/sem/evaluate.py b/nlp_resource_data/nltk/sem/evaluate.py
index 3a1eab0..adc0716 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Models for first-order languages with lambda
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>,
 # URL: <http://nltk.sourceforge.net>
 # For license information, see LICENSE.TXT
@@ -13,6 +13,7 @@
 This module provides data structures for representing first-order
 models.
 """
+from __future__ import print_function, unicode_literals
 
 from pprint import pformat
 import inspect
@@ -20,7 +21,10 @@ import textwrap
 import re
 import sys
 
+from six import string_types
+
 from nltk.decorators import decorator  # this used in code that is commented out
+from nltk.compat import python_2_unicode_compatible
 
 from nltk.sem.logic import (
     AbstractVariableExpression,
@@ -50,9 +54,12 @@ class Undefined(Error):
 
 
 def trace(f, *args, **kw):
-    argspec = inspect.getfullargspec(f)
+    if sys.version_info[0] >= 3:
+        argspec = inspect.getfullargspec(f)
+    else:
+        argspec = inspect.getargspec(f)
     d = dict(zip(argspec[0], args))
-    if d.pop("trace", None):
+    if d.pop('trace', None):
         print()
         for item in d.items():
             print("%s => %s" % item)
@@ -92,7 +99,7 @@ def set2rel(s):
     """
     new = set()
     for elem in s:
-        if isinstance(elem, str):
+        if isinstance(elem, string_types):
             new.add((elem,))
         elif isinstance(elem, int):
             new.add((str(elem)))
@@ -112,6 +119,7 @@ def arity(rel):
     return len(list(rel)[0])
 
 
+@python_2_unicode_compatible
 class Valuation(dict):
     """
     A dictionary which represents a model-theoretic Valuation of non-logical constants.
@@ -130,7 +138,7 @@ class Valuation(dict):
         """
         super(Valuation, self).__init__()
         for (sym, val) in xs:
-            if isinstance(val, str) or isinstance(val, bool):
+            if isinstance(val, string_types) or isinstance(val, bool):
                 self[sym] = val
             elif isinstance(val, set):
                 self[sym] = set2rel(val)
@@ -157,7 +165,7 @@ class Valuation(dict):
         """Set-theoretic domain of the value-space of a Valuation."""
         dom = []
         for val in self.values():
-            if isinstance(val, str):
+            if isinstance(val, string_types):
                 dom.append(val)
             elif not isinstance(val, bool):
                 dom.extend(
@@ -178,8 +186,8 @@ class Valuation(dict):
 ##########################################
 # REs used by the _read_valuation function
 ##########################################
-_VAL_SPLIT_RE = re.compile(r"\s*=+>\s*")
-_ELEMENT_SPLIT_RE = re.compile(r"\s*,\s*")
+_VAL_SPLIT_RE = re.compile(r'\s*=+>\s*')
+_ELEMENT_SPLIT_RE = re.compile(r'\s*,\s*')
 _TUPLES_RE = re.compile(
     r"""\s*
                                 (\([^)]+\))  # tuple-expression
@@ -207,7 +215,7 @@ def _read_valuation_line(s):
     symbol = pieces[0]
     value = pieces[1]
     # check whether the value is meant to be a set
-    if value.startswith("{"):
+    if value.startswith('{'):
         value = value[1:-1]
         tuple_strings = _TUPLES_RE.findall(value)
         # are the set elements tuples?
@@ -239,15 +247,16 @@ def read_valuation(s, encoding=None):
     statements = []
     for linenum, line in enumerate(s.splitlines()):
         line = line.strip()
-        if line.startswith("#") or line == "":
+        if line.startswith('#') or line == '':
             continue
         try:
             statements.append(_read_valuation_line(line))
         except ValueError:
-            raise ValueError("Unable to parse line %s: %s" % (linenum, line))
+            raise ValueError('Unable to parse line %s: %s' % (linenum, line))
     return Valuation(statements)
 
 
+@python_2_unicode_compatible
 class Assignment(dict):
     """
     A dictionary which represents an assignment of values to variables.
@@ -377,6 +386,7 @@ class Assignment(dict):
         return self
 
 
+@python_2_unicode_compatible
 class Model(object):
     """
     A first order model is a domain *D* of discourse and a valuation *V*.
@@ -431,7 +441,7 @@ class Model(object):
             if trace:
                 print()
                 print("'%s' is undefined under M, %s" % (expr, g))
-            return "Undefined"
+            return 'Undefined'
 
     def satisfy(self, parsed, g, trace=None):
         """
@@ -541,11 +551,11 @@ class Model(object):
         :return: a set of the entities that satisfy ``parsed``.
         """
 
-        spacer = "   "
+        spacer = '   '
         indent = spacer + (spacer * nesting)
         candidates = []
 
-        if isinstance(varex, str):
+        if isinstance(varex, string_types):
             var = Variable(varex)
         else:
             var = varex
@@ -605,37 +615,37 @@ def propdemo(trace=None):
     """Example of a propositional model."""
 
     global val1, dom1, m1, g1
-    val1 = Valuation([("P", True), ("Q", True), ("R", False)])
+    val1 = Valuation([('P', True), ('Q', True), ('R', False)])
     dom1 = set([])
     m1 = Model(dom1, val1)
     g1 = Assignment(dom1)
 
     print()
-    print("*" * mult)
+    print('*' * mult)
     print("Propositional Formulas Demo")
-    print("*" * mult)
-    print("(Propositional constants treated as nullary predicates)")
+    print('*' * mult)
+    print('(Propositional constants treated as nullary predicates)')
     print()
     print("Model m1:\n", m1)
-    print("*" * mult)
+    print('*' * mult)
     sentences = [
-        "(P & Q)",
-        "(P & R)",
-        "- P",
-        "- R",
-        "- - P",
-        "- (P & R)",
-        "(P | R)",
-        "(R | P)",
-        "(R | R)",
-        "(- P | R)",
-        "(P | - P)",
-        "(P -> Q)",
-        "(P -> R)",
-        "(R -> P)",
-        "(P <-> P)",
-        "(R <-> R)",
-        "(P <-> R)",
+        '(P & Q)',
+        '(P & R)',
+        '- P',
+        '- R',
+        '- - P',
+        '- (P & R)',
+        '(P | R)',
+        '(R | P)',
+        '(R | R)',
+        '(- P | R)',
+        '(P | - P)',
+        '(P -> Q)',
+        '(P -> R)',
+        '(R -> P)',
+        '(P <-> P)',
+        '(R <-> R)',
+        '(P <-> R)',
     ]
 
     for sent in sentences:
@@ -656,28 +666,28 @@ def folmodel(quiet=False, trace=None):
     global val2, v2, dom2, m2, g2
 
     v2 = [
-        ("adam", "b1"),
-        ("betty", "g1"),
-        ("fido", "d1"),
-        ("girl", set(["g1", "g2"])),
-        ("boy", set(["b1", "b2"])),
-        ("dog", set(["d1"])),
-        ("love", set([("b1", "g1"), ("b2", "g2"), ("g1", "b1"), ("g2", "b1")])),
+        ('adam', 'b1'),
+        ('betty', 'g1'),
+        ('fido', 'd1'),
+        ('girl', set(['g1', 'g2'])),
+        ('boy', set(['b1', 'b2'])),
+        ('dog', set(['d1'])),
+        ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')])),
     ]
     val2 = Valuation(v2)
     dom2 = val2.domain
     m2 = Model(dom2, val2)
-    g2 = Assignment(dom2, [("x", "b1"), ("y", "g2")])
+    g2 = Assignment(dom2, [('x', 'b1'), ('y', 'g2')])
 
     if not quiet:
         print()
-        print("*" * mult)
+        print('*' * mult)
         print("Models Demo")
         print("*" * mult)
         print("Model m2:\n", "-" * 14, "\n", m2)
         print("Variable assignment = ", g2)
 
-        exprs = ["adam", "boy", "love", "walks", "x", "y", "z"]
+        exprs = ['adam', 'boy', 'love', 'walks', 'x', 'y', 'z']
         parsed_exprs = [Expression.fromstring(e) for e in exprs]
 
         print()
@@ -691,10 +701,10 @@ def folmodel(quiet=False, trace=None):
                 print("The interpretation of '%s' in m2 is Undefined" % parsed)
 
         applications = [
-            ("boy", ("adam")),
-            ("walks", ("adam",)),
-            ("love", ("adam", "y")),
-            ("love", ("y", "adam")),
+            ('boy', ('adam')),
+            ('walks', ('adam',)),
+            ('love', ('adam', 'y')),
+            ('love', ('y', 'adam')),
         ]
 
         for (fun, args) in applications:
@@ -717,29 +727,29 @@ def foldemo(trace=None):
     folmodel(quiet=True)
 
     print()
-    print("*" * mult)
+    print('*' * mult)
     print("FOL Formulas Demo")
-    print("*" * mult)
+    print('*' * mult)
 
     formulas = [
-        "love (adam, betty)",
-        "(adam = mia)",
-        "\\x. (boy(x) | girl(x))",
-        "\\x. boy(x)(adam)",
-        "\\x y. love(x, y)",
-        "\\x y. love(x, y)(adam)(betty)",
-        "\\x y. love(x, y)(adam, betty)",
-        "\\x y. (boy(x) & love(x, y))",
-        "\\x. exists y. (boy(x) & love(x, y))",
-        "exists z1. boy(z1)",
-        "exists x. (boy(x) &  -(x = adam))",
-        "exists x. (boy(x) & all y. love(y, x))",
-        "all x. (boy(x) | girl(x))",
-        "all x. (girl(x) -> exists y. boy(y) & love(x, y))",  # Every girl loves exists boy.
-        "exists x. (boy(x) & all y. (girl(y) -> love(y, x)))",  # There is exists boy that every girl loves.
-        "exists x. (boy(x) & all y. (girl(y) -> love(x, y)))",  # exists boy loves every girl.
-        "all x. (dog(x) -> - girl(x))",
-        "exists x. exists y. (love(x, y) & love(x, y))",
+        'love (adam, betty)',
+        '(adam = mia)',
+        '\\x. (boy(x) | girl(x))',
+        '\\x. boy(x)(adam)',
+        '\\x y. love(x, y)',
+        '\\x y. love(x, y)(adam)(betty)',
+        '\\x y. love(x, y)(adam, betty)',
+        '\\x y. (boy(x) & love(x, y))',
+        '\\x. exists y. (boy(x) & love(x, y))',
+        'exists z1. boy(z1)',
+        'exists x. (boy(x) &  -(x = adam))',
+        'exists x. (boy(x) & all y. love(y, x))',
+        'all x. (boy(x) | girl(x))',
+        'all x. (girl(x) -> exists y. boy(y) & love(x, y))',  # Every girl loves exists boy.
+        'exists x. (boy(x) & all y. (girl(y) -> love(y, x)))',  # There is exists boy that every girl loves.
+        'exists x. (boy(x) & all y. (girl(y) -> love(x, y)))',  # exists boy loves every girl.
+        'all x. (dog(x) -> - girl(x))',
+        'exists x. exists y. (love(x, y) & love(x, y))',
     ]
 
     for fmla in formulas:
@@ -758,32 +768,32 @@ def satdemo(trace=None):
     """Satisfiers of an open formula in a first order model."""
 
     print()
-    print("*" * mult)
+    print('*' * mult)
     print("Satisfiers Demo")
-    print("*" * mult)
+    print('*' * mult)
 
     folmodel(quiet=True)
 
     formulas = [
-        "boy(x)",
-        "(x = x)",
-        "(boy(x) | girl(x))",
-        "(boy(x) & girl(x))",
-        "love(adam, x)",
-        "love(x, adam)",
-        "-(x = adam)",
-        "exists z22. love(x, z22)",
-        "exists y. love(y, x)",
-        "all y. (girl(y) -> love(x, y))",
-        "all y. (girl(y) -> love(y, x))",
-        "all y. (girl(y) -> (boy(x) & love(y, x)))",
-        "(boy(x) & all y. (girl(y) -> love(x, y)))",
-        "(boy(x) & all y. (girl(y) -> love(y, x)))",
-        "(boy(x) & exists y. (girl(y) & love(y, x)))",
-        "(girl(x) -> dog(x))",
-        "all y. (dog(y) -> (x = y))",
-        "exists y. love(y, x)",
-        "exists y. (love(adam, y) & love(y, x))",
+        'boy(x)',
+        '(x = x)',
+        '(boy(x) | girl(x))',
+        '(boy(x) & girl(x))',
+        'love(adam, x)',
+        'love(x, adam)',
+        '-(x = adam)',
+        'exists z22. love(x, z22)',
+        'exists y. love(y, x)',
+        'all y. (girl(y) -> love(x, y))',
+        'all y. (girl(y) -> love(y, x))',
+        'all y. (girl(y) -> (boy(x) & love(y, x)))',
+        '(boy(x) & all y. (girl(y) -> love(x, y)))',
+        '(boy(x) & all y. (girl(y) -> love(y, x)))',
+        '(boy(x) & exists y. (girl(y) & love(y, x)))',
+        '(girl(x) -> dog(x))',
+        'all y. (dog(y) -> (x = y))',
+        'exists y. love(y, x)',
+        'exists y. (love(adam, y) & love(y, x))',
     ]
 
     if trace:
@@ -797,7 +807,7 @@ def satdemo(trace=None):
 
     for p in parsed:
         g2.purge()
-        print("The satisfiers of '%s' are: %s" % (p, m2.satisfiers(p, "x", g2, trace)))
+        print("The satisfiers of '%s' are: %s" % (p, m2.satisfiers(p, 'x', g2, trace)))
 
 
 def demo(num=0, trace=None):
diff --git a/nlp_resource_data/nltk/sem/glue.py b/nlp_resource_data/nltk/sem/glue.py
index 684c90c..9fd3cab 100644 (file)
@@ -2,13 +2,16 @@
 #
 # Author: Dan Garrette <dhgarrette@gmail.com>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, division, unicode_literals
 
 import os
 from itertools import chain
 
+from six import string_types
+
 import nltk
 from nltk.internals import Counter
 from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger
@@ -19,43 +22,45 @@ from nltk.sem.logic import (
     LambdaExpression,
     AbstractVariableExpression,
 )
+from nltk.compat import python_2_unicode_compatible
 from nltk.sem import drt
 from nltk.sem import linearlogic
 
 SPEC_SEMTYPES = {
-    "a": "ex_quant",
-    "an": "ex_quant",
-    "every": "univ_quant",
-    "the": "def_art",
-    "no": "no_quant",
-    "default": "ex_quant",
+    'a': 'ex_quant',
+    'an': 'ex_quant',
+    'every': 'univ_quant',
+    'the': 'def_art',
+    'no': 'no_quant',
+    'default': 'ex_quant',
 }
 
-OPTIONAL_RELATIONSHIPS = ["nmod", "vmod", "punct"]
+OPTIONAL_RELATIONSHIPS = ['nmod', 'vmod', 'punct']
 
 
+@python_2_unicode_compatible
 class GlueFormula(object):
     def __init__(self, meaning, glue, indices=None):
         if not indices:
             indices = set()
 
-        if isinstance(meaning, str):
+        if isinstance(meaning, string_types):
             self.meaning = Expression.fromstring(meaning)
         elif isinstance(meaning, Expression):
             self.meaning = meaning
         else:
             raise RuntimeError(
-                "Meaning term neither string or expression: %s, %s"
+                'Meaning term neither string or expression: %s, %s'
                 % (meaning, meaning.__class__)
             )
 
-        if isinstance(glue, str):
+        if isinstance(glue, string_types):
             self.glue = linearlogic.LinearLogicParser().parse(glue)
         elif isinstance(glue, linearlogic.Expression):
             self.glue = glue
         else:
             raise RuntimeError(
-                "Glue term neither string or expression: %s, %s"
+                'Glue term neither string or expression: %s, %s'
                 % (glue, glue.__class__)
             )
 
@@ -88,7 +93,7 @@ class GlueFormula(object):
                 ::-1
             ]:  # if self.glue is (A -o B), dep is in A.dependencies
                 arg_meaning_abstracted = self.make_LambdaExpression(
-                    Variable("v%s" % dep), arg_meaning_abstracted
+                    Variable('v%s' % dep), arg_meaning_abstracted
                 )
         return_meaning = self.meaning.applyto(arg_meaning_abstracted)
 
@@ -140,15 +145,16 @@ class GlueFormula(object):
 
     def __str__(self):
         assert isinstance(self.indices, set)
-        accum = "%s : %s" % (self.meaning, self.glue)
+        accum = '%s : %s' % (self.meaning, self.glue)
         if self.indices:
-            accum += " : {" + ", ".join(str(index) for index in self.indices) + "}"
+            accum += ' : {' + ', '.join(str(index) for index in self.indices) + '}'
         return accum
 
     def __repr__(self):
         return "%s" % self
 
 
+@python_2_unicode_compatible
 class GlueDict(dict):
     def __init__(self, filename, encoding=None):
         self.filename = filename
@@ -161,13 +167,13 @@ class GlueDict(dict):
 
         try:
             contents = nltk.data.load(
-                self.filename, format="text", encoding=self.file_encoding
+                self.filename, format='text', encoding=self.file_encoding
             )
             # TODO: the above can't handle zip files, but this should anyway be fixed in nltk.data.load()
         except LookupError as e:
             try:
                 contents = nltk.data.load(
-                    "file:" + self.filename, format="text", encoding=self.file_encoding
+                    'file:' + self.filename, format='text', encoding=self.file_encoding
                 )
             except LookupError:
                 raise e
@@ -178,11 +184,11 @@ class GlueDict(dict):
             line = line.strip()  # remove trailing newline
             if not len(line):
                 continue  # skip empty lines
-            if line[0] == "#":
+            if line[0] == '#':
                 continue  # skip commented out lines
 
             parts = line.split(
-                " : ", 2
+                ' : ', 2
             )  # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']
 
             glue_formulas = []
@@ -194,11 +200,11 @@ class GlueDict(dict):
 
             if len(parts) > 1:
                 for (i, c) in enumerate(parts[1]):
-                    if c == "(":
+                    if c == '(':
                         if paren_count == 0:  # if it's the first '(' of a tuple
                             tuple_start = i + 1  # then save the index
                         paren_count += 1
-                    elif c == ")":
+                    elif c == ')':
                         paren_count -= 1
                         if paren_count == 0:  # if it's the last ')' of a tuple
                             meaning_term = parts[1][
@@ -208,33 +214,33 @@ class GlueDict(dict):
                             glue_formulas.append(
                                 [meaning_term, glue_term]
                             )  # add the GlueFormula to the list
-                    elif c == ",":
+                    elif c == ',':
                         if (
                             paren_count == 1
                         ):  # if it's a comma separating the parts of the tuple
                             tuple_comma = i  # then save the index
-                    elif c == "#":  # skip comments at the ends of lines
+                    elif c == '#':  # skip comments at the ends of lines
                         if (
                             paren_count != 0
                         ):  # if the line hasn't parsed correctly so far
                             raise RuntimeError(
-                                "Formula syntax is incorrect for entry " + line
+                                'Formula syntax is incorrect for entry ' + line
                             )
                         break  # break to the next line
 
             if len(parts) > 2:  # if there is a relationship entry at the end
-                rel_start = parts[2].index("[") + 1
-                rel_end = parts[2].index("]")
+                rel_start = parts[2].index('[') + 1
+                rel_end = parts[2].index(']')
                 if rel_start == rel_end:
                     relationships = frozenset()
                 else:
                     relationships = frozenset(
-                        r.strip() for r in parts[2][rel_start:rel_end].split(",")
+                        r.strip() for r in parts[2][rel_start:rel_end].split(',')
                     )
 
             try:
-                start_inheritance = parts[0].index("(")
-                end_inheritance = parts[0].index(")")
+                start_inheritance = parts[0].index('(')
+                end_inheritance = parts[0].index(')')
                 sem = parts[0][:start_inheritance].strip()
                 supertype = parts[0][start_inheritance + 1 : end_inheritance]
             except:
@@ -273,20 +279,20 @@ class GlueDict(dict):
                 )  # add the glue entry to the dictionary
 
     def __str__(self):
-        accum = ""
+        accum = ''
         for pos in self:
             str_pos = "%s" % pos
             for relset in self[pos]:
                 i = 1
                 for gf in self[pos][relset]:
                     if i == 1:
-                        accum += str_pos + ": "
+                        accum += str_pos + ': '
                     else:
-                        accum += " " * (len(str_pos) + 2)
+                        accum += ' ' * (len(str_pos) + 2)
                     accum += "%s" % gf
                     if relset and i == len(self[pos][relset]):
-                        accum += " : %s" % relset
-                    accum += "\n"
+                        accum += ' : %s' % relset
+                    accum += '\n'
                     i += 1
         return accum
 
@@ -294,13 +300,13 @@ class GlueDict(dict):
         if node is None:
             # TODO: should it be depgraph.root? Is this code tested?
             top = depgraph.nodes[0]
-            depList = list(chain(*top["deps"].values()))
+            depList = list(chain(*top['deps'].values()))
             root = depgraph.nodes[depList[0]]
 
             return self.to_glueformula_list(depgraph, root, Counter(), verbose)
 
         glueformulas = self.lookup(node, depgraph, counter)
-        for dep_idx in chain(*node["deps"].values()):
+        for dep_idx in chain(*node['deps'].values()):
             dep = depgraph.nodes[dep_idx]
             glueformulas.extend(
                 self.to_glueformula_list(depgraph, dep, counter, verbose)
@@ -326,29 +332,29 @@ class GlueDict(dict):
         if not len(lookup):
             raise KeyError(
                 "There is no GlueDict entry for sem type of '%s' "
-                "with tag '%s', and rel '%s'" % (node["word"], node["tag"], node["rel"])
+                "with tag '%s', and rel '%s'" % (node['word'], node['tag'], node['rel'])
             )
 
         return self.get_glueformulas_from_semtype_entry(
-            lookup, node["word"], node, depgraph, counter
+            lookup, node['word'], node, depgraph, counter
         )
 
     def add_missing_dependencies(self, node, depgraph):
-        rel = node["rel"].lower()
-
-        if rel == "main":
-            headnode = depgraph.nodes[node["head"]]
-            subj = self.lookup_unique("subj", headnode, depgraph)
-            relation = subj["rel"]
-            node["deps"].setdefault(relation, [])
-            node["deps"][relation].append(subj["address"])
+        rel = node['rel'].lower()
+
+        if rel == 'main':
+            headnode = depgraph.nodes[node['head']]
+            subj = self.lookup_unique('subj', headnode, depgraph)
+            relation = subj['rel']
+            node['deps'].setdefault(relation, [])
+            node['deps'][relation].append(subj['address'])
             # node['deps'].append(subj['address'])
 
     def _lookup_semtype_option(self, semtype, node, depgraph):
         relationships = frozenset(
-            depgraph.nodes[dep]["rel"].lower()
-            for dep in chain(*node["deps"].values())
-            if depgraph.nodes[dep]["rel"].lower() not in OPTIONAL_RELATIONSHIPS
+            depgraph.nodes[dep]['rel'].lower()
+            for dep in chain(*node['deps'].values())
+            if depgraph.nodes[dep]['rel'].lower() not in OPTIONAL_RELATIONSHIPS
         )
 
         try:
@@ -379,18 +385,18 @@ class GlueDict(dict):
         Based on the node, return a list of plausible semtypes in order of
         plausibility.
         """
-        rel = node["rel"].lower()
-        word = node["word"].lower()
+        rel = node['rel'].lower()
+        word = node['word'].lower()
 
-        if rel == "spec":
+        if rel == 'spec':
             if word in SPEC_SEMTYPES:
                 return [SPEC_SEMTYPES[word]]
             else:
-                return [SPEC_SEMTYPES["default"]]
-        elif rel in ["nmod", "vmod"]:
-            return [node["tag"], rel]
+                return [SPEC_SEMTYPES['default']]
+        elif rel in ['nmod', 'vmod']:
+            return [node['tag'], rel]
         else:
-            return [node["tag"]]
+            return [node['tag']]
 
     def get_glueformulas_from_semtype_entry(
         self, lookup, word, node, depgraph, counter
@@ -403,7 +409,7 @@ class GlueDict(dict):
             if not len(glueformulas):
                 gf.word = word
             else:
-                gf.word = "%s%s" % (word, len(glueformulas) + 1)
+                gf.word = '%s%s' % (word, len(glueformulas) + 1)
 
             gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get())
 
@@ -416,8 +422,8 @@ class GlueDict(dict):
         parameter "<word>"
         :param word: The actual word to replace "<word>"
         """
-        word = word.replace(".", "")
-        return generic.replace("<word>", word)
+        word = word.replace('.', '')
+        return generic.replace('<word>', word)
 
     def initialize_labels(self, expr, node, depgraph, unique_index):
         if isinstance(expr, linearlogic.AtomicExpression):
@@ -434,13 +440,13 @@ class GlueDict(dict):
 
     def find_label_name(self, name, node, depgraph, unique_index):
         try:
-            dot = name.index(".")
+            dot = name.index('.')
 
             before_dot = name[:dot]
             after_dot = name[dot + 1 :]
-            if before_dot == "super":
+            if before_dot == 'super':
                 return self.find_label_name(
-                    after_dot, depgraph.nodes[node["head"]], depgraph, unique_index
+                    after_dot, depgraph.nodes[node['head']], depgraph, unique_index
                 )
             else:
                 return self.find_label_name(
@@ -451,20 +457,20 @@ class GlueDict(dict):
                 )
         except ValueError:
             lbl = self.get_label(node)
-            if name == "f":
+            if name == 'f':
                 return lbl
-            elif name == "v":
-                return "%sv" % lbl
-            elif name == "r":
-                return "%sr" % lbl
-            elif name == "super":
-                return self.get_label(depgraph.nodes[node["head"]])
-            elif name == "var":
-                return "%s%s" % (lbl.upper(), unique_index)
-            elif name == "a":
-                return self.get_label(self.lookup_unique("conja", node, depgraph))
-            elif name == "b":
-                return self.get_label(self.lookup_unique("conjb", node, depgraph))
+            elif name == 'v':
+                return '%sv' % lbl
+            elif name == 'r':
+                return '%sr' % lbl
+            elif name == 'super':
+                return self.get_label(depgraph.nodes[node['head']])
+            elif name == 'var':
+                return '%s%s' % (lbl.upper(), unique_index)
+            elif name == 'a':
+                return self.get_label(self.lookup_unique('conja', node, depgraph))
+            elif name == 'b':
+                return self.get_label(self.lookup_unique('conjb', node, depgraph))
             else:
                 return self.get_label(self.lookup_unique(name, node, depgraph))
 
@@ -475,35 +481,35 @@ class GlueDict(dict):
         :param value: where to index into the list of characters
         :type value: int
         """
-        value = node["address"]
+        value = node['address']
 
         letter = [
-            "f",
-            "g",
-            "h",
-            "i",
-            "j",
-            "k",
-            "l",
-            "m",
-            "n",
-            "o",
-            "p",
-            "q",
-            "r",
-            "s",
-            "t",
-            "u",
-            "v",
-            "w",
-            "x",
-            "y",
-            "z",
-            "a",
-            "b",
-            "c",
-            "d",
-            "e",
+            'f',
+            'g',
+            'h',
+            'i',
+            'j',
+            'k',
+            'l',
+            'm',
+            'n',
+            'o',
+            'p',
+            'q',
+            'r',
+            's',
+            't',
+            'u',
+            'v',
+            'w',
+            'x',
+            'y',
+            'z',
+            'a',
+            'b',
+            'c',
+            'd',
+            'e',
         ][value - 1]
         num = int(value) // 26
         if num > 0:
@@ -517,15 +523,15 @@ class GlueDict(dict):
         """
         deps = [
             depgraph.nodes[dep]
-            for dep in chain(*node["deps"].values())
-            if depgraph.nodes[dep]["rel"].lower() == rel.lower()
+            for dep in chain(*node['deps'].values())
+            if depgraph.nodes[dep]['rel'].lower() == rel.lower()
         ]
 
         if len(deps) == 0:
-            raise KeyError("'%s' doesn't contain a feature '%s'" % (node["word"], rel))
+            raise KeyError("'%s' doesn't contain a feature '%s'" % (node['word'], rel))
         elif len(deps) > 1:
             raise KeyError(
-                "'%s' should only have one feature '%s'" % (node["word"], rel)
+                "'%s' should only have one feature '%s'" % (node['word'], rel)
             )
         else:
             return deps[0]
@@ -550,7 +556,7 @@ class Glue(object):
             self.semtype_file = semtype_file
         else:
             self.semtype_file = os.path.join(
-                "grammars", "sample_grammars", "glue.semtype"
+                'grammars', 'sample_grammars', 'glue.semtype'
             )
 
     def train_depparser(self, depgraphs=None):
@@ -559,7 +565,7 @@ class Glue(object):
         else:
             self.depparser.train_from_file(
                 nltk.data.find(
-                    os.path.join("grammars", "sample_grammars", "glue_train.conll")
+                    os.path.join('grammars', 'sample_grammars', 'glue_train.conll')
                 )
             )
 
@@ -649,8 +655,8 @@ class Glue(object):
                     # if there is an exception, the syntax of the formula
                     # may not be understandable by the prover, so don't
                     # throw out the reading.
-                    print("Error when checking logical equality of statements", e)
-
+                    print('Error when checking logical equality of statements', e)
+                    
         if add_reading:
             reading_list.append(glueformula.meaning)
 
@@ -689,7 +695,7 @@ class Glue(object):
             return_list.extend(gf.compile(index_counter))
 
         if self.verbose:
-            print("Compiled Glue Premises:")
+            print('Compiled Glue Premises:')
             for cgf in return_list:
                 print(cgf)
 
@@ -700,25 +706,25 @@ class Glue(object):
 
         regexp_tagger = RegexpTagger(
             [
-                (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # cardinal numbers
-                (r"(The|the|A|a|An|an)$", "AT"),  # articles
-                (r".*able$", "JJ"),  # adjectives
-                (r".*ness$", "NN"),  # nouns formed from adjectives
-                (r".*ly$", "RB"),  # adverbs
-                (r".*s$", "NNS"),  # plural nouns
-                (r".*ing$", "VBG"),  # gerunds
-                (r".*ed$", "VBD"),  # past tense verbs
-                (r".*", "NN"),  # nouns (default)
+                (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
+                (r'(The|the|A|a|An|an)$', 'AT'),  # articles
+                (r'.*able$', 'JJ'),  # adjectives
+                (r'.*ness$', 'NN'),  # nouns formed from adjectives
+                (r'.*ly$', 'RB'),  # adverbs
+                (r'.*s$', 'NNS'),  # plural nouns
+                (r'.*ing$', 'VBG'),  # gerunds
+                (r'.*ed$', 'VBD'),  # past tense verbs
+                (r'.*', 'NN'),  # nouns (default)
             ]
         )
-        brown_train = brown.tagged_sents(categories="news")
+        brown_train = brown.tagged_sents(categories='news')
         unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
         bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
         trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)
 
         # Override particular words
         main_tagger = RegexpTagger(
-            [(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")],
+            [(r'(A|a|An|an)$', 'ex_quant'), (r'(Every|every|All|all)$', 'univ_quant')],
             backoff=trigram_tagger,
         )
 
@@ -730,23 +736,23 @@ class DrtGlueFormula(GlueFormula):
         if not indices:
             indices = set()
 
-        if isinstance(meaning, str):
+        if isinstance(meaning, string_types):
             self.meaning = drt.DrtExpression.fromstring(meaning)
         elif isinstance(meaning, drt.DrtExpression):
             self.meaning = meaning
         else:
             raise RuntimeError(
-                "Meaning term neither string or expression: %s, %s"
+                'Meaning term neither string or expression: %s, %s'
                 % (meaning, meaning.__class__)
             )
 
-        if isinstance(glue, str):
+        if isinstance(glue, string_types):
             self.glue = linearlogic.LinearLogicParser().parse(glue)
         elif isinstance(glue, linearlogic.Expression):
             self.glue = glue
         else:
             raise RuntimeError(
-                "Glue term neither string or expression: %s, %s"
+                'Glue term neither string or expression: %s, %s'
                 % (glue, glue.__class__)
             )
 
@@ -770,7 +776,7 @@ class DrtGlue(Glue):
     ):
         if not semtype_file:
             semtype_file = os.path.join(
-                "grammars", "sample_grammars", "drt_glue.semtype"
+                'grammars', 'sample_grammars', 'drt_glue.semtype'
             )
         Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose)
 
@@ -782,12 +788,12 @@ def demo(show_example=-1):
     from nltk.parse import MaltParser
 
     examples = [
-        "David sees Mary",
-        "David eats a sandwich",
-        "every man chases a dog",
-        "every man believes a dog sleeps",
-        "John gives David a sandwich",
-        "John chases himself",
+        'David sees Mary',
+        'David eats a sandwich',
+        'every man chases a dog',
+        'every man believes a dog sleeps',
+        'John gives David a sandwich',
+        'John chases himself',
     ]
     #                'John persuades David to order a pizza',
     #                'John tries to go',
@@ -799,21 +805,21 @@ def demo(show_example=-1):
     #                'every big gray cat leaves',
     #                'a former senator leaves',
 
-    print("============== DEMO ==============")
+    print('============== DEMO ==============')
 
     tagger = RegexpTagger(
         [
-            ("^(David|Mary|John)$", "NNP"),
+            ('^(David|Mary|John)$', 'NNP'),
             (
-                "^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
-                "VB",
+                '^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$',
+                'VB',
             ),
-            ("^(go|order|vanish|find|approach)$", "VB"),
-            ("^(a)$", "ex_quant"),
-            ("^(every)$", "univ_quant"),
-            ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
-            ("^(big|gray|former)$", "JJ"),
-            ("^(him|himself)$", "PRP"),
+            ('^(go|order|vanish|find|approach)$', 'VB'),
+            ('^(a)$', 'ex_quant'),
+            ('^(every)$', 'univ_quant'),
+            ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
+            ('^(big|gray|former)$', 'JJ'),
+            ('^(him|himself)$', 'PRP'),
         ]
     )
 
@@ -822,11 +828,11 @@ def demo(show_example=-1):
 
     for (i, sentence) in enumerate(examples):
         if i == show_example or show_example == -1:
-            print("[[[Example %s]]]  %s" % (i, sentence))
+            print('[[[Example %s]]]  %s' % (i, sentence))
             for reading in glue.parse_to_meaning(sentence.split()):
                 print(reading.simplify())
-            print("")
+            print('')
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
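
A quick illustration of the GlueFormula API touched by the hunks above (a minimal sketch, not part of the patch; the meaning and glue terms below are invented for illustration):

    from nltk.sem.glue import GlueFormula

    # A lambda-calculus meaning term paired with a linear-logic glue term;
    # the meaning string is parsed with Expression.fromstring and the glue
    # string with LinearLogicParser, as in GlueFormula.__init__ above.
    gf = GlueFormula(r'\x.dog(x)', '(f -o g)')
    print(gf)  # prints something like: \x.dog(x) : (f -o g)
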
index bcd6dbf..32852b8 100644 (file)
@@ -3,7 +3,7 @@
 # Author:     Peter Wang
 # Updated by: Dan Garrette <dhgarrette@gmail.com>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org>
 # For license information, see LICENSE.TXT
 
@@ -19,9 +19,13 @@ After parsing, the semantic representation is in the form of an underspecified
 representation that is not easy to read.  We use a "plugging" algorithm to
 convert that representation into first-order logic formulas.
 """
+from __future__ import print_function, unicode_literals
 
 from functools import reduce
 
+from six import itervalues
+
+from nltk import compat
 from nltk.parse import load_parser
 
 from nltk.sem.skolemize import skolemize
@@ -48,17 +52,17 @@ from nltk.sem.logic import (
 
 
 class Constants(object):
-    ALL = "ALL"
-    EXISTS = "EXISTS"
-    NOT = "NOT"
-    AND = "AND"
-    OR = "OR"
-    IMP = "IMP"
-    IFF = "IFF"
-    PRED = "PRED"
-    LEQ = "LEQ"
-    HOLE = "HOLE"
-    LABEL = "LABEL"
+    ALL = 'ALL'
+    EXISTS = 'EXISTS'
+    NOT = 'NOT'
+    AND = 'AND'
+    OR = 'OR'
+    IMP = 'IMP'
+    IFF = 'IFF'
+    PRED = 'PRED'
+    LEQ = 'LEQ'
+    HOLE = 'HOLE'
+    LABEL = 'LABEL'
 
     MAP = {
         ALL: lambda v, e: AllExpression(v.variable, e),
@@ -139,7 +143,7 @@ class HoleSemantics(object):
 
     def _find_top_nodes(self, node_list):
         top_nodes = node_list.copy()
-        for f in self.fragments.values():
+        for f in itervalues(self.fragments):
             # the label is the first argument of the predicate
             args = f[1]
             for arg in args:
@@ -203,7 +207,7 @@ class HoleSemantics(object):
                 head = [(a, ancestors) for a in args if self.is_node(a)]
                 self._plug_nodes(head + queue[1:], potential_labels, plug_acc, record)
         else:
-            raise Exception("queue empty")
+            raise Exception('queue empty')
 
     def _plug_hole(self, hole, ancestors0, queue, potential_labels0, plug_acc0, record):
         """
@@ -298,6 +302,7 @@ class HoleSemantics(object):
             return node
 
 
+@compat.python_2_unicode_compatible
 class Constraint(object):
     """
     This class represents a constraint of the form (L =< N),
@@ -321,15 +326,15 @@ class Constraint(object):
         return hash(repr(self))
 
     def __repr__(self):
-        return "(%s < %s)" % (self.lhs, self.rhs)
+        return '(%s < %s)' % (self.lhs, self.rhs)
 
 
 def hole_readings(sentence, grammar_filename=None, verbose=False):
     if not grammar_filename:
-        grammar_filename = "grammars/sample_grammars/hole.fcfg"
+        grammar_filename = 'grammars/sample_grammars/hole.fcfg'
 
     if verbose:
-        print("Reading grammar file", grammar_filename)
+        print('Reading grammar file', grammar_filename)
 
     parser = load_parser(grammar_filename)
 
@@ -337,16 +342,16 @@ def hole_readings(sentence, grammar_filename=None, verbose=False):
     tokens = sentence.split()
     trees = list(parser.parse(tokens))
     if verbose:
-        print("Got %d different parses" % len(trees))
+        print('Got %d different parses' % len(trees))
 
     all_readings = []
     for tree in trees:
         # Get the semantic feature from the top of the parse tree.
-        sem = tree.label()["SEM"].simplify()
+        sem = tree.label()['SEM'].simplify()
 
         # Print the raw semantic representation.
         if verbose:
-            print("Raw:       ", sem)
+            print('Raw:       ', sem)
 
         # Skolemize away all quantifiers.  All variables become unique.
         while isinstance(sem, LambdaExpression):
@@ -354,7 +359,7 @@ def hole_readings(sentence, grammar_filename=None, verbose=False):
         skolemized = skolemize(sem)
 
         if verbose:
-            print("Skolemized:", skolemized)
+            print('Skolemized:', skolemized)
 
         # Break the hole semantics representation down into its components
         # i.e. holes, labels, formula fragments and constraints.
@@ -362,14 +367,14 @@ def hole_readings(sentence, grammar_filename=None, verbose=False):
 
         # Maybe show the details of the semantic representation.
         if verbose:
-            print("Holes:       ", hole_sem.holes)
-            print("Labels:      ", hole_sem.labels)
-            print("Constraints: ", hole_sem.constraints)
-            print("Top hole:    ", hole_sem.top_hole)
-            print("Top labels:  ", hole_sem.top_most_labels)
-            print("Fragments:")
+            print('Holes:       ', hole_sem.holes)
+            print('Labels:      ', hole_sem.labels)
+            print('Constraints: ', hole_sem.constraints)
+            print('Top hole:    ', hole_sem.top_hole)
+            print('Top labels:  ', hole_sem.top_most_labels)
+            print('Fragments:')
             for l, f in hole_sem.fragments.items():
-                print("\t%s: %s" % (l, f))
+                print('\t%s: %s' % (l, f))
 
         # Find all the possible ways to plug the formulas together.
         pluggings = hole_sem.pluggings()
@@ -381,7 +386,7 @@ def hole_readings(sentence, grammar_filename=None, verbose=False):
         if verbose:
             for i, r in enumerate(readings):
                 print()
-                print("%d. %s" % (i, r))
+                print('%d. %s' % (i, r))
             print()
 
         all_readings.extend(readings)
@@ -389,9 +394,9 @@ def hole_readings(sentence, grammar_filename=None, verbose=False):
     return all_readings
 
 
-if __name__ == "__main__":
-    for r in hole_readings("a dog barks"):
+if __name__ == '__main__':
+    for r in hole_readings('a dog barks'):
         print(r)
     print()
-    for r in hole_readings("every girl chases a dog"):
+    for r in hole_readings('every girl chases a dog'):
         print(r)
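
The hole.py docstring quoted above describes how an underspecified representation is "plugged" into first-order readings. A minimal sketch of exercising that entry point after the conversion (assuming the sample_grammars data package, which provides hole.fcfg, is available locally, e.g. via nltk.download('sample_grammars')):

    from nltk.sem.hole import hole_readings

    # "every girl chases a dog" is scope-ambiguous, so the plugging algorithm
    # should yield one first-order reading per admissible plugging.
    for reading in hole_readings('every girl chases a dog'):
        print(reading)
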
index d4decf0..9b6957e 100644 (file)
@@ -2,15 +2,18 @@
 #
 # Author: Dan Garrette <dhgarrette@gmail.com>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, division, unicode_literals
 
 from itertools import chain
 
 from nltk.internals import Counter
+from nltk.compat import python_2_unicode_compatible
 
 
+@python_2_unicode_compatible
 class FStructure(dict):
     def safeappend(self, key, item):
         """
@@ -40,15 +43,15 @@ class FStructure(dict):
         depgraph = DependencyGraph()
         nodes = depgraph.nodes
 
-        self._to_depgraph(nodes, 0, "ROOT")
+        self._to_depgraph(nodes, 0, 'ROOT')
 
         # Add all the dependencies for all the nodes
         for address, node in nodes.items():
-            for n2 in (n for n in nodes.values() if n["rel"] != "TOP"):
-                if n2["head"] == address:
-                    relation = n2["rel"]
-                    node["deps"].setdefault(relation, [])
-                    node["deps"][relation].append(n2["address"])
+            for n2 in (n for n in nodes.values() if n['rel'] != 'TOP'):
+                if n2['head'] == address:
+                    relation = n2['rel']
+                    node['deps'].setdefault(relation, [])
+                    node['deps'][relation].append(n2['address'])
 
         depgraph.root = nodes[1]
 
@@ -59,11 +62,11 @@ class FStructure(dict):
 
         nodes[index].update(
             {
-                "address": index,
-                "word": self.pred[0],
-                "tag": self.pred[1],
-                "head": head,
-                "rel": rel,
+                'address': index,
+                'word': self.pred[0],
+                'tag': self.pred[1],
+                'head': head,
+                'rel': rel,
             }
         )
 
@@ -75,11 +78,11 @@ class FStructure(dict):
                     new_index = len(nodes)
                     nodes[new_index].update(
                         {
-                            "address": new_index,
-                            "word": item[0],
-                            "tag": item[1],
-                            "head": index,
-                            "rel": feature,
+                            'address': new_index,
+                            'word': item[0],
+                            'tag': item[1],
+                            'head': index,
+                            'rel': feature,
                         }
                     )
                 elif isinstance(item, list):
@@ -87,7 +90,7 @@ class FStructure(dict):
                         n._to_depgraph(nodes, index, feature)
                 else:
                     raise Exception(
-                        "feature %s is not an FStruct, a list, or a tuple" % feature
+                        'feature %s is not an FStruct, a list, or a tuple' % feature
                     )
 
     @staticmethod
@@ -99,9 +102,9 @@ class FStructure(dict):
         if not label_counter:
             label_counter = Counter()
 
-        if node["rel"].lower() in ["spec", "punct"]:
+        if node['rel'].lower() in ['spec', 'punct']:
             # the value of a 'spec' entry is a word, not an FStructure
-            return (node["word"], node["tag"])
+            return (node['word'], node['tag'])
 
         else:
             fstruct = FStructure()
@@ -110,19 +113,19 @@ class FStructure(dict):
 
             fstruct.parent = parent
 
-            word, tag = node["word"], node["tag"]
-            if tag[:2] == "VB":
-                if tag[2:3] == "D":
-                    fstruct.safeappend("tense", ("PAST", "tense"))
+            word, tag = node['word'], node['tag']
+            if tag[:2] == 'VB':
+                if tag[2:3] == 'D':
+                    fstruct.safeappend('tense', ('PAST', 'tense'))
                 fstruct.pred = (word, tag[:2])
 
             if not fstruct.pred:
                 fstruct.pred = (word, tag)
 
-            children = [depgraph.nodes[idx] for idx in chain(*node["deps"].values())]
+            children = [depgraph.nodes[idx] for idx in chain(*node['deps'].values())]
             for child in children:
                 fstruct.safeappend(
-                    child["rel"],
+                    child['rel'],
                     FStructure._read_depgraph(child, depgraph, label_counter, fstruct),
                 )
 
@@ -137,32 +140,32 @@ class FStructure(dict):
         :type value: int
         """
         letter = [
-            "f",
-            "g",
-            "h",
-            "i",
-            "j",
-            "k",
-            "l",
-            "m",
-            "n",
-            "o",
-            "p",
-            "q",
-            "r",
-            "s",
-            "t",
-            "u",
-            "v",
-            "w",
-            "x",
-            "y",
-            "z",
-            "a",
-            "b",
-            "c",
-            "d",
-            "e",
+            'f',
+            'g',
+            'h',
+            'i',
+            'j',
+            'k',
+            'l',
+            'm',
+            'n',
+            'o',
+            'p',
+            'q',
+            'r',
+            's',
+            't',
+            'u',
+            'v',
+            'w',
+            'x',
+            'y',
+            'z',
+            'a',
+            'b',
+            'c',
+            'd',
+            'e',
         ][value - 1]
         num = int(value) // 26
         if num > 0:
@@ -171,18 +174,18 @@ class FStructure(dict):
             return letter
 
     def __repr__(self):
-        return self.__str__().replace("\n", "")
+        return self.__unicode__().replace('\n', '')
 
     def __str__(self):
         return self.pretty_format()
 
     def pretty_format(self, indent=3):
         try:
-            accum = "%s:[" % self.label
+            accum = '%s:[' % self.label
         except NameError:
-            accum = "["
+            accum = '['
         try:
-            accum += "pred '%s'" % (self.pred[0])
+            accum += 'pred \'%s\'' % (self.pred[0])
         except NameError:
             pass
 
@@ -190,24 +193,24 @@ class FStructure(dict):
             for item in self[feature]:
                 if isinstance(item, FStructure):
                     next_indent = indent + len(feature) + 3 + len(self.label)
-                    accum += "\n%s%s %s" % (
-                        " " * (indent),
+                    accum += '\n%s%s %s' % (
+                        ' ' * (indent),
                         feature,
                         item.pretty_format(next_indent),
                     )
                 elif isinstance(item, tuple):
-                    accum += "\n%s%s '%s'" % (" " * (indent), feature, item[0])
+                    accum += '\n%s%s \'%s\'' % (' ' * (indent), feature, item[0])
                 elif isinstance(item, list):
-                    accum += "\n%s%s {%s}" % (
-                        " " * (indent),
+                    accum += '\n%s%s {%s}' % (
+                        ' ' * (indent),
                         feature,
-                        ("\n%s" % (" " * (indent + len(feature) + 2))).join(item),
+                        ('\n%s' % (' ' * (indent + len(feature) + 2))).join(item),
                     )
                 else:  # ERROR
                     raise Exception(
-                        "feature %s is not an FStruct, a list, or a tuple" % feature
+                        'feature %s is not an FStruct, a list, or a tuple' % feature
                     )
-        return accum + "]"
+        return accum + ']'
 
 
 def demo_read_depgraph():
@@ -254,5 +257,5 @@ dog     NN      3       OBJ
         print(FStructure.read_depgraph(dg))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo_read_depgraph()
index abd9d19..2725980 100644 (file)
@@ -2,11 +2,15 @@
 #
 # Author: Dan Garrette <dhgarrette@gmail.com>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function, unicode_literals
+
+from six import string_types
 
 from nltk.internals import Counter
+from nltk.compat import python_2_unicode_compatible
 from nltk.sem.logic import LogicParser, APP
 
 _counter = Counter()
@@ -14,11 +18,11 @@ _counter = Counter()
 
 class Tokens(object):
     # Punctuation
-    OPEN = "("
-    CLOSE = ")"
+    OPEN = '('
+    CLOSE = ')'
 
     # Operations
-    IMP = "-o"
+    IMP = '-o'
 
     PUNCT = [OPEN, CLOSE]
     TOKENS = PUNCT + [IMP]
@@ -71,6 +75,7 @@ class LinearLogicParser(LogicParser):
             return ConstantExpression(name)
 
 
+@python_2_unicode_compatible
 class Expression(object):
 
     _linear_logic_parser = LinearLogicParser()
@@ -86,16 +91,17 @@ class Expression(object):
         return self.applyto(other)
 
     def __repr__(self):
-        return "<%s %s>" % (self.__class__.__name__, self)
+        return '<%s %s>' % (self.__class__.__name__, self)
 
 
+@python_2_unicode_compatible
 class AtomicExpression(Expression):
     def __init__(self, name, dependencies=None):
         """
         :param name: str for the constant name
         :param dependencies: list of int for the indices on which this atom is dependent
         """
-        assert isinstance(name, str)
+        assert isinstance(name, string_types)
         self.name = name
 
         if not dependencies:
@@ -198,6 +204,7 @@ class VariableExpression(AtomicExpression):
             raise UnificationException(self, other, bindings)
 
 
+@python_2_unicode_compatible
 class ImpExpression(Expression):
     def __init__(self, antecedent, consequent):
         """
@@ -257,7 +264,7 @@ class ImpExpression(Expression):
         (c, c_new) = self.consequent.compile_neg(index_counter, glueFormulaFactory)
         fresh_index = index_counter.get()
         c.dependencies.append(fresh_index)
-        new_v = glueFormulaFactory("v%s" % fresh_index, a, set([fresh_index]))
+        new_v = glueFormulaFactory('v%s' % fresh_index, a, set([fresh_index]))
         return (c, a_new + c_new + [new_v])
 
     def initialize_labels(self, fstruct):
@@ -285,10 +292,11 @@ class ImpExpression(Expression):
 
     def __hash__(self):
         return hash(
-            "%s%s%s" % (hash(self.antecedent), Tokens.IMP, hash(self.consequent))
+            '%s%s%s' % (hash(self.antecedent), Tokens.IMP, hash(self.consequent))
         )
 
 
+@python_2_unicode_compatible
 class ApplicationExpression(Expression):
     def __init__(self, function, argument, argument_indices=None):
         """
@@ -313,7 +321,7 @@ class ApplicationExpression(Expression):
             bindings += function_simp.antecedent.unify(argument_simp, bindings)
         except UnificationException as e:
             raise LinearLogicApplicationException(
-                "Cannot apply %s to %s. %s" % (function_simp, argument_simp, e)
+                'Cannot apply %s to %s. %s' % (function_simp, argument_simp, e)
             )
 
         # If you are running it on compiled premises, more conditions apply
@@ -321,12 +329,12 @@ class ApplicationExpression(Expression):
             # A.dependencies of (A -o (B -o C)) must be a proper subset of argument_indices
             if not set(function_simp.antecedent.dependencies) < argument_indices:
                 raise LinearLogicApplicationException(
-                    "Dependencies unfulfilled when attempting to apply Linear Logic formula %s to %s"
+                    'Dependencies unfulfilled when attempting to apply Linear Logic formula %s to %s'
                     % (function_simp, argument_simp)
                 )
             if set(function_simp.antecedent.dependencies) == argument_indices:
                 raise LinearLogicApplicationException(
-                    "Dependencies not a proper subset of indices when attempting to apply Linear Logic formula %s to %s"
+                    'Dependencies not a proper subset of indices when attempting to apply Linear Logic formula %s to %s'
                     % (function_simp, argument_simp)
                 )
 
@@ -363,10 +371,11 @@ class ApplicationExpression(Expression):
 
     def __hash__(self):
         return hash(
-            "%s%s%s" % (hash(self.antecedent), Tokens.OPEN, hash(self.consequent))
+            '%s%s%s' % (hash(self.antecedent), Tokens.OPEN, hash(self.consequent))
         )
 
 
+@python_2_unicode_compatible
 class BindingDict(object):
     def __init__(self, bindings=None):
         """
@@ -403,7 +412,7 @@ class BindingDict(object):
             self.d[variable] = binding
         else:
             raise VariableBindingException(
-                "Variable %s already bound to another value" % (variable)
+                'Variable %s already bound to another value' % (variable)
             )
 
     def __getitem__(self, variable):
@@ -437,8 +446,8 @@ class BindingDict(object):
             return combined
         except VariableBindingException:
             raise VariableBindingException(
-                "Attempting to add two contradicting"
-                " VariableBindingsLists: %s, %s" % (self, other)
+                'Attempting to add two contradicting'
+                ' VariableBindingsLists: %s, %s' % (self, other)
             )
 
     def __ne__(self, other):
@@ -450,10 +459,10 @@ class BindingDict(object):
         return self.d == other.d
 
     def __str__(self):
-        return "{" + ", ".join("%s: %s" % (v, self.d[v]) for v in self.d) + "}"
+        return '{' + ', '.join('%s: %s' % (v, self.d[v]) for v in self.d) + '}'
 
     def __repr__(self):
-        return "BindingDict: %s" % self
+        return 'BindingDict: %s' % self
 
 
 class VariableBindingException(Exception):
@@ -462,7 +471,7 @@ class VariableBindingException(Exception):
 
 class UnificationException(Exception):
     def __init__(self, a, b, bindings):
-        Exception.__init__(self, "Cannot unify %s with %s given %s" % (a, b, bindings))
+        Exception.__init__(self, 'Cannot unify %s with %s given %s' % (a, b, bindings))
 
 
 class LinearLogicApplicationException(Exception):
@@ -472,15 +481,15 @@ class LinearLogicApplicationException(Exception):
 def demo():
     lexpr = Expression.fromstring
 
-    print(lexpr(r"f"))
-    print(lexpr(r"(g -o f)"))
-    print(lexpr(r"((g -o G) -o G)"))
-    print(lexpr(r"g -o h -o f"))
-    print(lexpr(r"(g -o f)(g)").simplify())
-    print(lexpr(r"(H -o f)(g)").simplify())
-    print(lexpr(r"((g -o G) -o G)((g -o f))").simplify())
-    print(lexpr(r"(H -o H)((g -o f))").simplify())
+    print(lexpr(r'f'))
+    print(lexpr(r'(g -o f)'))
+    print(lexpr(r'((g -o G) -o G)'))
+    print(lexpr(r'g -o h -o f'))
+    print(lexpr(r'(g -o f)(g)').simplify())
+    print(lexpr(r'(H -o f)(g)').simplify())
+    print(lexpr(r'((g -o G) -o G)((g -o f))').simplify())
+    print(lexpr(r'(H -o H)((g -o f))').simplify())
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
index c203e1f..fe5f73b 100644 (file)
@@ -2,7 +2,7 @@
 #
 # Author: Dan Garrette <dhgarrette@gmail.com>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org>
 # For license information, see LICENSE.TXT
 
 A version of first order predicate logic, built on
 top of the typed lambda calculus.
 """
+from __future__ import print_function, unicode_literals
 
 import re
 import operator
 from collections import defaultdict
 from functools import reduce, total_ordering
 
+from six import string_types
+
 from nltk.util import Trie
 from nltk.internals import Counter
+from nltk.compat import python_2_unicode_compatible
 
-APP = "APP"
+APP = 'APP'
 
 _counter = Counter()
 
 
 class Tokens(object):
-    LAMBDA = "\\"
-    LAMBDA_LIST = ["\\"]
+    LAMBDA = '\\'
+    LAMBDA_LIST = ['\\']
 
     # Quantifiers
-    EXISTS = "exists"
-    EXISTS_LIST = ["some", "exists", "exist"]
-    ALL = "all"
-    ALL_LIST = ["all", "forall"]
+    EXISTS = 'exists'
+    EXISTS_LIST = ['some', 'exists', 'exist']
+    ALL = 'all'
+    ALL_LIST = ['all', 'forall']
 
     # Punctuation
-    DOT = "."
-    OPEN = "("
-    CLOSE = ")"
-    COMMA = ","
+    DOT = '.'
+    OPEN = '('
+    CLOSE = ')'
+    COMMA = ','
 
     # Operations
-    NOT = "-"
-    NOT_LIST = ["not", "-", "!"]
-    AND = "&"
-    AND_LIST = ["and", "&", "^"]
-    OR = "|"
-    OR_LIST = ["or", "|"]
-    IMP = "->"
-    IMP_LIST = ["implies", "->", "=>"]
-    IFF = "<->"
-    IFF_LIST = ["iff", "<->", "<=>"]
-    EQ = "="
-    EQ_LIST = ["=", "=="]
-    NEQ = "!="
-    NEQ_LIST = ["!="]
+    NOT = '-'
+    NOT_LIST = ['not', '-', '!']
+    AND = '&'
+    AND_LIST = ['and', '&', '^']
+    OR = '|'
+    OR_LIST = ['or', '|']
+    IMP = '->'
+    IMP_LIST = ['implies', '->', '=>']
+    IFF = '<->'
+    IFF_LIST = ['iff', '<->', '<=>']
+    EQ = '='
+    EQ_LIST = ['=', '==']
+    NEQ = '!='
+    NEQ_LIST = ['!=']
 
     # Collections of tokens
     BINOPS = AND_LIST + OR_LIST + IMP_LIST + IFF_LIST
@@ -64,7 +68,7 @@ class Tokens(object):
     TOKENS = BINOPS + EQ_LIST + NEQ_LIST + QUANTS + LAMBDA_LIST + PUNCT + NOT_LIST
 
     # Special
-    SYMBOLS = [x for x in TOKENS if re.match(r"^[-\\.(),!&^|>=<]*$", x)]
+    SYMBOLS = [x for x in TOKENS if re.match(r'^[-\\.(),!&^|>=<]*$', x)]
 
 
 def boolean_ops():
@@ -94,6 +98,7 @@ def binding_ops():
         print("%-15s\t%s" % pair)
 
 
+@python_2_unicode_compatible
 class LogicParser(object):
     """A lambda calculus expression parser."""
 
@@ -152,7 +157,7 @@ class LogicParser(object):
             if self.inRange(0):
                 raise UnexpectedTokenException(self._currentIndex + 1, self.token(0))
         except LogicalExpressionException as e:
-            msg = "%s\n%s\n%s^" % (e, data, " " * mapping[e.index - 1])
+            msg = '%s\n%s\n%s^' % (e, data, ' ' * mapping[e.index - 1])
             raise LogicalExpressionException(None, msg)
 
         if self.type_check:
@@ -165,7 +170,7 @@ class LogicParser(object):
         out = []
         mapping = {}
         tokenTrie = Trie(self.get_all_symbols())
-        token = ""
+        token = ''
         data_idx = 0
         token_start_idx = data_idx
         while data_idx < len(data):
@@ -179,7 +184,7 @@ class LogicParser(object):
 
             st = tokenTrie
             c = data[data_idx]
-            symbol = ""
+            symbol = ''
             while c in st:
                 symbol += c
                 st = st[c]
@@ -192,16 +197,16 @@ class LogicParser(object):
                 if token:
                     mapping[len(out)] = token_start_idx
                     out.append(token)
-                    token = ""
+                    token = ''
                 mapping[len(out)] = data_idx
                 out.append(symbol)
                 data_idx += len(symbol)
             else:
-                if data[data_idx] in " \t\n":  # any whitespace
+                if data[data_idx] in ' \t\n':  # any whitespace
                     if token:
                         mapping[len(out)] = token_start_idx
                         out.append(token)
-                        token = ""
+                        token = ''
                 else:
                     if not token:
                         token_start_idx = data_idx
@@ -215,7 +220,7 @@ class LogicParser(object):
         return out, mapping
 
     def process_quoted_token(self, data_idx, data):
-        token = ""
+        token = ''
         c = data[data_idx]
         i = data_idx
         for start, end, escape, incl_quotes in self.quote_chars:
@@ -246,7 +251,7 @@ class LogicParser(object):
                     token += data[i]
                 i += 1
                 if not token:
-                    raise LogicalExpressionException(None, "Empty quoted token found")
+                    raise LogicalExpressionException(None, 'Empty quoted token found')
                 break
         return token, i
 
@@ -281,14 +286,14 @@ class LogicParser(object):
             tok = self.token()
         except ExpectedMoreTokensException:
             raise ExpectedMoreTokensException(
-                self._currentIndex + 1, message="Expression expected."
+                self._currentIndex + 1, message='Expression expected.'
             )
 
         accum = self.handle(tok, context)
 
         if not accum:
             raise UnexpectedTokenException(
-                self._currentIndex, tok, message="Expression expected."
+                self._currentIndex, tok, message='Expression expected.'
             )
 
         return self.attempt_adjuncts(accum, context)
@@ -360,7 +365,7 @@ class LogicParser(object):
         try:
             tok = self.token()
         except ExpectedMoreTokensException as e:
-            raise ExpectedMoreTokensException(e.index, "Variable expected.")
+            raise ExpectedMoreTokensException(e.index, 'Variable expected.')
         if isinstance(self.make_VariableExpression(tok), ConstantExpression):
             raise LogicalExpressionException(
                 self._currentIndex,
@@ -376,7 +381,7 @@ class LogicParser(object):
                 self._currentIndex + 2,
                 message="Variable and Expression expected following lambda operator.",
             )
-        vars = [self.get_next_token_variable("abstracted")]
+        vars = [self.get_next_token_variable('abstracted')]
         while True:
             if not self.inRange(0) or (
                 self.token(0) == Tokens.DOT and not self.inRange(1)
@@ -387,7 +392,7 @@ class LogicParser(object):
             if not self.isvariable(self.token(0)):
                 break
             # Support expressions like: \x y.M == \x.\y.M
-            vars.append(self.get_next_token_variable("abstracted"))
+            vars.append(self.get_next_token_variable('abstracted'))
         if self.inRange(0) and self.token(0) == Tokens.DOT:
             self.token()  # swallow the dot
 
@@ -406,7 +411,7 @@ class LogicParser(object):
                 message="Variable and Expression expected following quantifier '%s'."
                 % tok,
             )
-        vars = [self.get_next_token_variable("quantified")]
+        vars = [self.get_next_token_variable('quantified')]
         while True:
             if not self.inRange(0) or (
                 self.token(0) == Tokens.DOT and not self.inRange(1)
@@ -417,7 +422,7 @@ class LogicParser(object):
             if not self.isvariable(self.token(0)):
                 break
             # Support expressions like: some x y.M == some x.some y.M
-            vars.append(self.get_next_token_variable("quantified"))
+            vars.append(self.get_next_token_variable('quantified'))
         if self.inRange(0) and self.token(0) == Tokens.DOT:
             self.token()  # swallow the dot
 
@@ -577,10 +582,10 @@ class LogicParser(object):
 
     def __repr__(self):
         if self.inRange(0):
-            msg = "Next token: " + self.token(0)
+            msg = 'Next token: ' + self.token(0)
         else:
-            msg = "No more tokens"
-        return "<" + self.__class__.__name__ + ": " + msg + ">"
+            msg = 'No more tokens'
+        return '<' + self.__class__.__name__ + ': ' + msg + '>'
 
 
 def read_logic(s, logic_parser=None, encoding=None):
@@ -604,22 +609,23 @@ def read_logic(s, logic_parser=None, encoding=None):
     statements = []
     for linenum, line in enumerate(s.splitlines()):
         line = line.strip()
-        if line.startswith("#") or line == "":
+        if line.startswith('#') or line == '':
             continue
         try:
             statements.append(logic_parser.parse(line))
         except LogicalExpressionException:
-            raise ValueError("Unable to parse line %s: %s" % (linenum, line))
+            raise ValueError('Unable to parse line %s: %s' % (linenum, line))
     return statements
 
 
 @total_ordering
+@python_2_unicode_compatible
 class Variable(object):
     def __init__(self, name):
         """
         :param name: the name of the variable
         """
-        assert isinstance(name, str), "%s is not a string" % name
+        assert isinstance(name, string_types), "%s is not a string" % name
         self.name = name
 
     def __eq__(self, other):
@@ -658,15 +664,15 @@ def unique_variable(pattern=None, ignore=None):
     """
     if pattern is not None:
         if is_indvar(pattern.name):
-            prefix = "z"
+            prefix = 'z'
         elif is_funcvar(pattern.name):
-            prefix = "F"
+            prefix = 'F'
         elif is_eventvar(pattern.name):
-            prefix = "e0"
+            prefix = 'e0'
         else:
             assert False, "Cannot generate a unique constant"
     else:
-        prefix = "z"
+        prefix = 'z'
 
     v = Variable("%s%s" % (prefix, _counter.get()))
     while ignore is not None and v in ignore:
@@ -679,13 +685,14 @@ def skolem_function(univ_scope=None):
     Return a skolem function over the variables in univ_scope
     param univ_scope
     """
-    skolem = VariableExpression(Variable("F%s" % _counter.get()))
+    skolem = VariableExpression(Variable('F%s' % _counter.get()))
     if univ_scope:
         for v in list(univ_scope):
             skolem = skolem(VariableExpression(v))
     return skolem
 
 
+@python_2_unicode_compatible
 class Type(object):
     def __repr__(self):
         return "%s" % self
@@ -698,6 +705,7 @@ class Type(object):
         return read_type(s)
 
 
+@python_2_unicode_compatible
 class ComplexType(Type):
     def __init__(self, first, second):
         assert isinstance(first, Type), "%s is not a Type" % first
@@ -742,13 +750,13 @@ class ComplexType(Type):
         if self == ANY_TYPE:
             return "%s" % ANY_TYPE
         else:
-            return "<%s,%s>" % (self.first, self.second)
+            return '<%s,%s>' % (self.first, self.second)
 
     def str(self):
         if self == ANY_TYPE:
             return ANY_TYPE.str()
         else:
-            return "(%s -> %s)" % (self.first.str(), self.second.str())
+            return '(%s -> %s)' % (self.first.str(), self.second.str())
 
 
 class BasicType(Type):
@@ -770,30 +778,34 @@ class BasicType(Type):
             return None
 
 
+@python_2_unicode_compatible
 class EntityType(BasicType):
     def __str__(self):
-        return "e"
+        return 'e'
 
     def str(self):
-        return "IND"
+        return 'IND'
 
 
+@python_2_unicode_compatible
 class TruthValueType(BasicType):
     def __str__(self):
-        return "t"
+        return 't'
 
     def str(self):
-        return "BOOL"
+        return 'BOOL'
 
 
+@python_2_unicode_compatible
 class EventType(BasicType):
     def __str__(self):
-        return "v"
+        return 'v'
 
     def str(self):
-        return "EVENT"
+        return 'EVENT'
 
 
+@python_2_unicode_compatible
 class AnyType(BasicType, ComplexType):
     def __init__(self):
         pass
@@ -821,10 +833,10 @@ class AnyType(BasicType, ComplexType):
         return other
 
     def __str__(self):
-        return "?"
+        return '?'
 
     def str(self):
-        return "ANY"
+        return 'ANY'
 
 
 TRUTH_TYPE = TruthValueType()
@@ -834,19 +846,19 @@ ANY_TYPE = AnyType()
 
 
 def read_type(type_string):
-    assert isinstance(type_string, str)
-    type_string = type_string.replace(" ", "")  # remove spaces
+    assert isinstance(type_string, string_types)
+    type_string = type_string.replace(' ', '')  # remove spaces
 
-    if type_string[0] == "<":
-        assert type_string[-1] == ">"
+    if type_string[0] == '<':
+        assert type_string[-1] == '>'
         paren_count = 0
         for i, char in enumerate(type_string):
-            if char == "<":
+            if char == '<':
                 paren_count += 1
-            elif char == ">":
+            elif char == '>':
                 paren_count -= 1
                 assert paren_count > 0
-            elif char == ",":
+            elif char == ',':
                 if paren_count == 1:
                     break
         return ComplexType(
@@ -859,9 +871,7 @@ def read_type(type_string):
     elif type_string[0] == "%s" % ANY_TYPE:
         return ANY_TYPE
     else:
-        raise LogicalExpressionException(
-            None, "Unexpected character: '%s'." % type_string[0]
-        )
+        raise LogicalExpressionException(None, "Unexpected character: '%s'." % type_string[0])
 
 
 class TypeException(Exception):
@@ -938,6 +948,7 @@ class SubstituteBindingsI(object):
         raise NotImplementedError()
 
 
+@python_2_unicode_compatible
 class Expression(SubstituteBindingsI):
     """This is the base abstract object for all logical expressions"""
 
@@ -1025,8 +1036,8 @@ class Expression(SubstituteBindingsI):
                     val = self.make_VariableExpression(val)
                 elif not isinstance(val, Expression):
                     raise ValueError(
-                        "Can not substitute a non-expression "
-                        "value into an expression: %r" % (val,)
+                        'Can not substitute a non-expression '
+                        'value into an expression: %r' % (val,)
                     )
                 # Substitute bindings in the target value.
                 val = val.substitute_bindings(bindings)
@@ -1110,9 +1121,9 @@ class Expression(SubstituteBindingsI):
         result = self
         for i, e in enumerate(sorted(get_indiv_vars(self), key=lambda e: e.variable)):
             if isinstance(e, EventVariableExpression):
-                newVar = e.__class__(Variable("e0%s" % (i + 1)))
+                newVar = e.__class__(Variable('e0%s' % (i + 1)))
             elif isinstance(e, IndividualVariableExpression):
-                newVar = e.__class__(Variable("z%s" % (i + 1)))
+                newVar = e.__class__(Variable('z%s' % (i + 1)))
             else:
                 newVar = e
             result = result.replace(e.variable, newVar, True)
@@ -1151,7 +1162,7 @@ class Expression(SubstituteBindingsI):
         return self.visit(function, lambda parts: combinator(*parts))
 
     def __repr__(self):
-        return "<%s %s>" % (self.__class__.__name__, self)
+        return '<%s %s>' % (self.__class__.__name__, self)
 
     def __str__(self):
         return self.str()
@@ -1164,7 +1175,7 @@ class Expression(SubstituteBindingsI):
         :return: set of ``Variable`` objects
         """
         return self.free() | set(
-            p for p in self.predicates() | self.constants() if re.match("^[?@]", p.name)
+            p for p in self.predicates() | self.constants() if re.match('^[?@]', p.name)
         )
 
     def free(self):
@@ -1205,6 +1216,7 @@ class Expression(SubstituteBindingsI):
         return VariableExpression(variable)
 
 
+@python_2_unicode_compatible
 class ApplicationExpression(Expression):
     r"""
     This class is used to represent two related types of logical expressions.
@@ -1347,7 +1359,7 @@ class ApplicationExpression(Expression):
         # uncurry the arguments and find the base function
         if self.is_atom():
             function, args = self.uncurry()
-            arg_str = ",".join("%s" % arg for arg in args)
+            arg_str = ','.join("%s" % arg for arg in args)
         else:
             # Leave arguments curried
             function = self.function
@@ -1408,6 +1420,7 @@ class ApplicationExpression(Expression):
 
 
 @total_ordering
+@python_2_unicode_compatible
 class AbstractVariableExpression(Expression):
     """This class represents a variable to be used as a predicate or entity"""
 
@@ -1683,6 +1696,7 @@ class VariableBinderExpression(Expression):
     __hash__ = Expression.__hash__
 
 
+@python_2_unicode_compatible
 class LambdaExpression(VariableBinderExpression):
     @property
     def type(self):
@@ -1707,12 +1721,13 @@ class LambdaExpression(VariableBinderExpression):
             term = term.term
         return (
             Tokens.LAMBDA
-            + " ".join("%s" % v for v in variables)
+            + ' '.join("%s" % v for v in variables)
             + Tokens.DOT
             + "%s" % term
         )
 
 
+@python_2_unicode_compatible
 class QuantifiedExpression(VariableBinderExpression):
     @property
     def type(self):
@@ -1737,8 +1752,8 @@ class QuantifiedExpression(VariableBinderExpression):
             term = term.term
         return (
             self.getQuantifier()
-            + " "
-            + " ".join("%s" % v for v in variables)
+            + ' '
+            + ' '.join("%s" % v for v in variables)
             + Tokens.DOT
             + "%s" % term
         )
@@ -1754,6 +1769,7 @@ class AllExpression(QuantifiedExpression):
         return Tokens.ALL
 
 
+@python_2_unicode_compatible
 class NegatedExpression(Expression):
     def __init__(self, term):
         assert isinstance(term, Expression), "%s is not an Expression" % term
@@ -1798,6 +1814,7 @@ class NegatedExpression(Expression):
         return Tokens.NOT + "%s" % self.term
 
 
+@python_2_unicode_compatible
 class BinaryExpression(Expression):
     def __init__(self, first, second):
         assert isinstance(first, Expression), "%s is not an Expression" % first
@@ -1840,7 +1857,7 @@ class BinaryExpression(Expression):
     def __str__(self):
         first = self._str_subex(self.first)
         second = self._str_subex(self.second)
-        return Tokens.OPEN + first + " " + self.getOp() + " " + second + Tokens.CLOSE
+        return Tokens.OPEN + first + ' ' + self.getOp() + ' ' + second + Tokens.CLOSE
 
     def _str_subex(self, subex):
         return "%s" % subex
@@ -1938,7 +1955,7 @@ class UnexpectedTokenException(LogicalExpressionException):
         elif unexpected:
             msg = "Unexpected token: '%s'." % unexpected
             if message:
-                msg += "  " + message
+                msg += '  ' + message
         else:
             msg = "Expected token '%s'." % expected
         LogicalExpressionException.__init__(self, index, msg)
@@ -1947,9 +1964,9 @@ class UnexpectedTokenException(LogicalExpressionException):
 class ExpectedMoreTokensException(LogicalExpressionException):
     def __init__(self, index, message=None):
         if not message:
-            message = "More tokens expected."
+            message = 'More tokens expected.'
         LogicalExpressionException.__init__(
-            self, index, "End of input found.  " + message
+            self, index, 'End of input found.  ' + message
         )
 
 
@@ -1961,8 +1978,8 @@ def is_indvar(expr):
     :param expr: str
     :return: bool True if expr is of the correct form
     """
-    assert isinstance(expr, str), "%s is not a string" % expr
-    return re.match(r"^[a-df-z]\d*$", expr) is not None
+    assert isinstance(expr, string_types), "%s is not a string" % expr
+    return re.match(r'^[a-df-z]\d*$', expr) is not None
 
 
 def is_funcvar(expr):
@@ -1973,8 +1990,8 @@ def is_funcvar(expr):
     :param expr: str
     :return: bool True if expr is of the correct form
     """
-    assert isinstance(expr, str), "%s is not a string" % expr
-    return re.match(r"^[A-Z]\d*$", expr) is not None
+    assert isinstance(expr, string_types), "%s is not a string" % expr
+    return re.match(r'^[A-Z]\d*$', expr) is not None
 
 
 def is_eventvar(expr):
@@ -1985,58 +2002,58 @@ def is_eventvar(expr):
     :param expr: str
     :return: bool True if expr is of the correct form
     """
-    assert isinstance(expr, str), "%s is not a string" % expr
-    return re.match(r"^e\d*$", expr) is not None
+    assert isinstance(expr, string_types), "%s is not a string" % expr
+    return re.match(r'^e\d*$', expr) is not None
 
 
 def demo():
     lexpr = Expression.fromstring
-    print("=" * 20 + "Test reader" + "=" * 20)
-    print(lexpr(r"john"))
-    print(lexpr(r"man(x)"))
-    print(lexpr(r"-man(x)"))
-    print(lexpr(r"(man(x) & tall(x) & walks(x))"))
-    print(lexpr(r"exists x.(man(x) & tall(x) & walks(x))"))
-    print(lexpr(r"\x.man(x)"))
-    print(lexpr(r"\x.man(x)(john)"))
-    print(lexpr(r"\x y.sees(x,y)"))
-    print(lexpr(r"\x y.sees(x,y)(a,b)"))
-    print(lexpr(r"(\x.exists y.walks(x,y))(x)"))
-    print(lexpr(r"exists x.x = y"))
-    print(lexpr(r"exists x.(x = y)"))
-    print(lexpr("P(x) & x=y & P(y)"))
-    print(lexpr(r"\P Q.exists x.(P(x) & Q(x))"))
-    print(lexpr(r"man(x) <-> tall(x)"))
-
-    print("=" * 20 + "Test simplify" + "=" * 20)
-    print(lexpr(r"\x.\y.sees(x,y)(john)(mary)").simplify())
-    print(lexpr(r"\x.\y.sees(x,y)(john, mary)").simplify())
-    print(lexpr(r"all x.(man(x) & (\x.exists y.walks(x,y))(x))").simplify())
-    print(lexpr(r"(\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x))(\x.bark(x))").simplify())
-
-    print("=" * 20 + "Test alpha conversion and binder expression equality" + "=" * 20)
-    e1 = lexpr("exists x.P(x)")
+    print('=' * 20 + 'Test reader' + '=' * 20)
+    print(lexpr(r'john'))
+    print(lexpr(r'man(x)'))
+    print(lexpr(r'-man(x)'))
+    print(lexpr(r'(man(x) & tall(x) & walks(x))'))
+    print(lexpr(r'exists x.(man(x) & tall(x) & walks(x))'))
+    print(lexpr(r'\x.man(x)'))
+    print(lexpr(r'\x.man(x)(john)'))
+    print(lexpr(r'\x y.sees(x,y)'))
+    print(lexpr(r'\x y.sees(x,y)(a,b)'))
+    print(lexpr(r'(\x.exists y.walks(x,y))(x)'))
+    print(lexpr(r'exists x.x = y'))
+    print(lexpr(r'exists x.(x = y)'))
+    print(lexpr('P(x) & x=y & P(y)'))
+    print(lexpr(r'\P Q.exists x.(P(x) & Q(x))'))
+    print(lexpr(r'man(x) <-> tall(x)'))
+
+    print('=' * 20 + 'Test simplify' + '=' * 20)
+    print(lexpr(r'\x.\y.sees(x,y)(john)(mary)').simplify())
+    print(lexpr(r'\x.\y.sees(x,y)(john, mary)').simplify())
+    print(lexpr(r'all x.(man(x) & (\x.exists y.walks(x,y))(x))').simplify())
+    print(lexpr(r'(\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x))(\x.bark(x))').simplify())
+
+    print('=' * 20 + 'Test alpha conversion and binder expression equality' + '=' * 20)
+    e1 = lexpr('exists x.P(x)')
     print(e1)
-    e2 = e1.alpha_convert(Variable("z"))
+    e2 = e1.alpha_convert(Variable('z'))
     print(e2)
     print(e1 == e2)
 
 
 def demo_errors():
-    print("=" * 20 + "Test reader errors" + "=" * 20)
-    demoException("(P(x) & Q(x)")
-    demoException("((P(x) &) & Q(x))")
-    demoException("P(x) -> ")
-    demoException("P(x")
-    demoException("P(x,")
-    demoException("P(x,)")
-    demoException("exists")
-    demoException("exists x.")
-    demoException("\\")
-    demoException("\\ x y.")
-    demoException("P(x)Q(x)")
-    demoException("(P(x)Q(x)")
-    demoException("exists x -> y")
+    print('=' * 20 + 'Test reader errors' + '=' * 20)
+    demoException('(P(x) & Q(x)')
+    demoException('((P(x) &) & Q(x))')
+    demoException('P(x) -> ')
+    demoException('P(x')
+    demoException('P(x,')
+    demoException('P(x,)')
+    demoException('exists')
+    demoException('exists x.')
+    demoException('\\')
+    demoException('\\ x y.')
+    demoException('P(x)Q(x)')
+    demoException('(P(x)Q(x)')
+    demoException('exists x -> y')
 
 
 def demoException(s):
@@ -2050,6 +2067,6 @@ def printtype(ex):
     print("%s : %s" % (ex.str(), ex.type))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
 #    demo_errors()
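The hunks above apply a uniform Python 2/3 compatibility treatment to nltk/sem/logic.py: string literals become single-quoted, isinstance checks test against string_types instead of the Python 3-only str, and the expression and type classes regain the python_2_unicode_compatible decorator. Below is a minimal, illustrative sketch of that decorator pattern, assuming the helpers come from six (the patch is presumed to import them elsewhere in the module); the Tok class is invented for the example.

from six import python_2_unicode_compatible, string_types


@python_2_unicode_compatible  # no-op on Python 3; on Python 2 it adds __unicode__ and a byte-returning __str__
class Tok(object):
    def __init__(self, name):
        # mirror the patch's style of asserting on string_types rather than str
        assert isinstance(name, string_types), "%s is not a string" % name
        self.name = name

    def __str__(self):
        return 'Tok(%s)' % self.name


print(Tok('x'))  # prints Tok(x) identically under Python 2 and 3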
index 1d1ec76..5837f84 100644
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Relation Extraction
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -19,45 +19,47 @@ The two serialization outputs are "rtuple" and "clause".
 - A clause is an atom of the form ``relsym(subjsym, objsym)``,
   where the relation, subject and object have been canonicalized to single strings.
 """
+from __future__ import print_function
 
 # todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs?
 
 from collections import defaultdict
-import html
 import re
 
+from six.moves import html_entities
+
 # Dictionary that associates corpora with NE classes
 NE_CLASSES = {
-    "ieer": [
-        "LOCATION",
-        "ORGANIZATION",
-        "PERSON",
-        "DURATION",
-        "DATE",
-        "CARDINAL",
-        "PERCENT",
-        "MONEY",
-        "MEASURE",
+    'ieer': [
+        'LOCATION',
+        'ORGANIZATION',
+        'PERSON',
+        'DURATION',
+        'DATE',
+        'CARDINAL',
+        'PERCENT',
+        'MONEY',
+        'MEASURE',
     ],
-    "conll2002": ["LOC", "PER", "ORG"],
-    "ace": [
-        "LOCATION",
-        "ORGANIZATION",
-        "PERSON",
-        "DURATION",
-        "DATE",
-        "CARDINAL",
-        "PERCENT",
-        "MONEY",
-        "MEASURE",
-        "FACILITY",
-        "GPE",
+    'conll2002': ['LOC', 'PER', 'ORG'],
+    'ace': [
+        'LOCATION',
+        'ORGANIZATION',
+        'PERSON',
+        'DURATION',
+        'DATE',
+        'CARDINAL',
+        'PERCENT',
+        'MONEY',
+        'MEASURE',
+        'FACILITY',
+        'GPE',
     ],
 }
 
 # Allow abbreviated class labels
-short2long = dict(LOC="LOCATION", ORG="ORGANIZATION", PER="PERSON")
-long2short = dict(LOCATION="LOC", ORGANIZATION="ORG", PERSON="PER")
+short2long = dict(LOC='LOCATION', ORG='ORGANIZATION', PER='PERSON')
+long2short = dict(LOCATION='LOC', ORGANIZATION='ORG', PERSON='PER')
 
 
 def _expand(type):
@@ -84,7 +86,7 @@ def class_abbrev(type):
         return type
 
 
-def _join(lst, sep=" ", untag=False):
+def _join(lst, sep=' ', untag=False):
     """
     Join a list into a string, turning tags tuples into tag strings or just words.
     :param untag: if ``True``, omit the tag from tagged input strings.
@@ -101,13 +103,19 @@ def _join(lst, sep=" ", untag=False):
         return sep.join(tuple2str(tup) for tup in lst)
 
 
-def descape_entity(m, defs=html.entities.entitydefs):
+def descape_entity(m, defs=html_entities.entitydefs):
     """
     Translate one entity to its ISO Latin value.
     Inspired by example from effbot.org
 
 
     """
+    # s = 'mcglashan_&amp;_sarrail'
+    # l = ['mcglashan', '&amp;', 'sarrail']
+    # pattern = re.compile("&(\w+?);")
+    # new = list2sym(l)
+    # s = pattern.sub(descape_entity, s)
+    # print s, new
     try:
         return defs[m.group(1)]
 
@@ -122,11 +130,11 @@ def list2sym(lst):
     :return: a Unicode string without whitespace
     :rtype: unicode
     """
-    sym = _join(lst, "_", untag=True)
+    sym = _join(lst, '_', untag=True)
     sym = sym.lower()
     ENT = re.compile("&(\w+?);")
     sym = ENT.sub(descape_entity, sym)
-    sym = sym.replace(".", "")
+    sym = sym.replace('.', '')
     return sym
 
 
@@ -175,23 +183,23 @@ def semi_rel2reldict(pairs, window=5, trace=False):
     result = []
     while len(pairs) > 2:
         reldict = defaultdict(str)
-        reldict["lcon"] = _join(pairs[0][0][-window:])
-        reldict["subjclass"] = pairs[0][1].label()
-        reldict["subjtext"] = _join(pairs[0][1].leaves())
-        reldict["subjsym"] = list2sym(pairs[0][1].leaves())
-        reldict["filler"] = _join(pairs[1][0])
-        reldict["untagged_filler"] = _join(pairs[1][0], untag=True)
-        reldict["objclass"] = pairs[1][1].label()
-        reldict["objtext"] = _join(pairs[1][1].leaves())
-        reldict["objsym"] = list2sym(pairs[1][1].leaves())
-        reldict["rcon"] = _join(pairs[2][0][:window])
+        reldict['lcon'] = _join(pairs[0][0][-window:])
+        reldict['subjclass'] = pairs[0][1].label()
+        reldict['subjtext'] = _join(pairs[0][1].leaves())
+        reldict['subjsym'] = list2sym(pairs[0][1].leaves())
+        reldict['filler'] = _join(pairs[1][0])
+        reldict['untagged_filler'] = _join(pairs[1][0], untag=True)
+        reldict['objclass'] = pairs[1][1].label()
+        reldict['objtext'] = _join(pairs[1][1].leaves())
+        reldict['objsym'] = list2sym(pairs[1][1].leaves())
+        reldict['rcon'] = _join(pairs[2][0][:window])
         if trace:
             print(
                 "(%s(%s, %s)"
                 % (
-                    reldict["untagged_filler"],
-                    reldict["subjclass"],
-                    reldict["objclass"],
+                    reldict['untagged_filler'],
+                    reldict['subjclass'],
+                    reldict['objclass'],
                 )
             )
         result.append(reldict)
@@ -199,7 +207,7 @@ def semi_rel2reldict(pairs, window=5, trace=False):
     return result
 
 
-def extract_rels(subjclass, objclass, doc, corpus="ace", pattern=None, window=10):
+def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10):
     """
     Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.
 
@@ -241,9 +249,9 @@ def extract_rels(subjclass, objclass, doc, corpus="ace", pattern=None, window=10
                 "your value for the object type has not been recognized: %s" % objclass
             )
 
-    if corpus == "ace" or corpus == "conll2002":
+    if corpus == 'ace' or corpus == 'conll2002':
         pairs = tree2semi_rel(doc)
-    elif corpus == "ieer":
+    elif corpus == 'ieer':
         pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
     else:
         raise ValueError("corpus type not recognized")
@@ -251,10 +259,10 @@ def extract_rels(subjclass, objclass, doc, corpus="ace", pattern=None, window=10
     reldicts = semi_rel2reldict(pairs)
 
     relfilter = lambda x: (
-        x["subjclass"] == subjclass
-        and len(x["filler"].split()) <= window
-        and pattern.match(x["filler"])
-        and x["objclass"] == objclass
+        x['subjclass'] == subjclass
+        and len(x['filler'].split()) <= window
+        and pattern.match(x['filler'])
+        and x['objclass'] == objclass
     )
 
     return list(filter(relfilter, reldicts))
@@ -267,19 +275,19 @@ def rtuple(reldict, lcon=False, rcon=False):
     :type reldict: defaultdict
     """
     items = [
-        class_abbrev(reldict["subjclass"]),
-        reldict["subjtext"],
-        reldict["filler"],
-        class_abbrev(reldict["objclass"]),
-        reldict["objtext"],
+        class_abbrev(reldict['subjclass']),
+        reldict['subjtext'],
+        reldict['filler'],
+        class_abbrev(reldict['objclass']),
+        reldict['objtext'],
     ]
-    format = "[%s: %r] %r [%s: %r]"
+    format = '[%s: %r] %r [%s: %r]'
     if lcon:
-        items = [reldict["lcon"]] + items
-        format = "...%r)" + format
+        items = [reldict['lcon']] + items
+        format = '...%r)' + format
     if rcon:
-        items.append(reldict["rcon"])
-        format = format + "(%r..."
+        items.append(reldict['rcon'])
+        format = format + '(%r...'
     printargs = tuple(items)
     return format % printargs
 
@@ -292,7 +300,7 @@ def clause(reldict, relsym):
     :param relsym: a label for the relation
     :type relsym: str
     """
-    items = (relsym, reldict["subjsym"], reldict["objsym"])
+    items = (relsym, reldict['subjsym'], reldict['objsym'])
     return "%s(%r, %r)" % items
 
 
@@ -330,7 +338,7 @@ def in_demo(trace=0, sql=True):
 
             warnings.warn("Cannot import sqlite; sql flag will be ignored.")
 
-    IN = re.compile(r".*\bin\b(?!\b.+ing)")
+    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
 
     print()
     print("IEER: in(ORG, LOC) -- just the clauses:")
@@ -341,11 +349,11 @@ def in_demo(trace=0, sql=True):
             if trace:
                 print(doc.docno)
                 print("=" * 15)
-            for rel in extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN):
-                print(clause(rel, relsym="IN"))
+            for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
+                print(clause(rel, relsym='IN'))
                 if sql:
                     try:
-                        rtuple = (rel["subjtext"], rel["objtext"], doc.docno)
+                        rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
                         cur.execute(
                             """insert into Locations
                                     values (?, ?, ?)""",
@@ -417,7 +425,7 @@ def roles_demo(trace=0):
                 print(doc.docno)
                 print("=" * 15)
                 lcon = rcon = True
-            for rel in extract_rels("PER", "ORG", doc, corpus="ieer", pattern=ROLES):
+            for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                 print(rtuple(rel, lcon=lcon, rcon=rcon))
 
 
@@ -473,12 +481,12 @@ def conllned(trace=1):
     print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
     print("=" * 45)
 
-    for doc in conll2002.chunked_sents("ned.train"):
+    for doc in conll2002.chunked_sents('ned.train'):
         lcon = rcon = False
         if trace:
             lcon = rcon = True
         for rel in extract_rels(
-            "PER", "ORG", doc, corpus="conll2002", pattern=VAN, window=10
+            'PER', 'ORG', doc, corpus='conll2002', pattern=VAN, window=10
         ):
             print(rtuple(rel, lcon=lcon, rcon=rcon))
 
@@ -505,11 +513,11 @@ def conllesp():
     print("=" * 45)
     rels = [
         rel
-        for doc in conll2002.chunked_sents("esp.train")
-        for rel in extract_rels("ORG", "LOC", doc, corpus="conll2002", pattern=DE)
+        for doc in conll2002.chunked_sents('esp.train')
+        for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=DE)
     ]
     for r in rels[:10]:
-        print(clause(r, relsym="DE"))
+        print(clause(r, relsym='DE'))
     print()
 
 
@@ -518,17 +526,17 @@ def ne_chunked():
     print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
     print("=" * 45)
     ROLE = re.compile(
-        r".*(chairman|president|trader|scientist|economist|analyst|partner).*"
+        r'.*(chairman|president|trader|scientist|economist|analyst|partner).*'
     )
     rels = []
     for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
         sent = nltk.ne_chunk(sent)
-        rels = extract_rels("PER", "ORG", sent, corpus="ace", pattern=ROLE, window=7)
+        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
         for rel in rels:
-            print("{0:<5}{1}".format(i, rtuple(rel)))
+            print('{0:<5}{1}'.format(i, rtuple(rel)))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     import nltk
     from nltk.sem import relextract
 
index 3070480..73eaf4b 100644
@@ -2,7 +2,7 @@
 #
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
@@ -116,7 +116,7 @@ def skolemize(expression, univ_scope=None, used_variables=None):
         elif isinstance(negated, ApplicationExpression):
             return expression
         else:
-            raise Exception("'%s' cannot be skolemized" % expression)
+            raise Exception('\'%s\' cannot be skolemized' % expression)
     elif isinstance(expression, ExistsExpression):
         term = skolemize(
             expression.term, univ_scope, used_variables | set([expression.variable])
@@ -129,7 +129,7 @@ def skolemize(expression, univ_scope=None, used_variables=None):
     elif isinstance(expression, ApplicationExpression):
         return expression
     else:
-        raise Exception("'%s' cannot be skolemized" % expression)
+        raise Exception('\'%s\' cannot be skolemized' % expression)
 
 
 def to_cnf(first, second):
index a36442b..bac3884 100644
@@ -2,7 +2,7 @@
 #
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
@@ -12,6 +12,7 @@ extraction of the semantic representation of the root node of the the
 syntax tree, followed by evaluation of the semantic representation in
 a first-order model.
 """
+from __future__ import print_function, unicode_literals
 
 import codecs
 from nltk.sem import evaluate
@@ -49,7 +50,7 @@ def parse_sents(inputs, grammar, trace=0):
     return parses
 
 
-def root_semrep(syntree, semkey="SEM"):
+def root_semrep(syntree, semkey='SEM'):
     """
     Find the semantic representation at the root of a tree.
 
@@ -65,12 +66,12 @@ def root_semrep(syntree, semkey="SEM"):
     try:
         return node[semkey]
     except KeyError:
-        print(node, end=" ")
+        print(node, end=' ')
         print("has no specification for the feature %s" % semkey)
     raise
 
 
-def interpret_sents(inputs, grammar, semkey="SEM", trace=0):
+def interpret_sents(inputs, grammar, semkey='SEM', trace=0):
     """
     Add the semantic representation to each syntactic parse tree
     of each input sentence.
@@ -113,24 +114,24 @@ def demo_model0():
     global m0, g0
     # Initialize a valuation of non-logical constants."""
     v = [
-        ("john", "b1"),
-        ("mary", "g1"),
-        ("suzie", "g2"),
-        ("fido", "d1"),
-        ("tess", "d2"),
-        ("noosa", "n"),
-        ("girl", set(["g1", "g2"])),
-        ("boy", set(["b1", "b2"])),
-        ("dog", set(["d1", "d2"])),
-        ("bark", set(["d1", "d2"])),
-        ("walk", set(["b1", "g2", "d1"])),
-        ("chase", set([("b1", "g1"), ("b2", "g1"), ("g1", "d1"), ("g2", "d2")])),
+        ('john', 'b1'),
+        ('mary', 'g1'),
+        ('suzie', 'g2'),
+        ('fido', 'd1'),
+        ('tess', 'd2'),
+        ('noosa', 'n'),
+        ('girl', set(['g1', 'g2'])),
+        ('boy', set(['b1', 'b2'])),
+        ('dog', set(['d1', 'd2'])),
+        ('bark', set(['d1', 'd2'])),
+        ('walk', set(['b1', 'g2', 'd1'])),
+        ('chase', set([('b1', 'g1'), ('b2', 'g1'), ('g1', 'd1'), ('g2', 'd2')])),
         (
-            "see",
-            set([("b1", "g1"), ("b2", "d2"), ("g1", "b1"), ("d2", "b1"), ("g2", "n")]),
+            'see',
+            set([('b1', 'g1'), ('b2', 'd2'), ('g1', 'b1'), ('d2', 'b1'), ('g2', 'n')]),
         ),
-        ("in", set([("b1", "n"), ("b2", "n"), ("d2", "n")])),
-        ("with", set([("b1", "g1"), ("g1", "b1"), ("d1", "b1"), ("b1", "d1")])),
+        ('in', set([('b1', 'n'), ('b2', 'n'), ('d2', 'n')])),
+        ('with', set([('b1', 'g1'), ('g1', 'b1'), ('d1', 'b1'), ('b1', 'd1')])),
     ]
     # Read in the data from ``v``
     val = evaluate.Valuation(v)
@@ -142,13 +143,13 @@ def demo_model0():
     g0 = evaluate.Assignment(dom)
 
 
-def read_sents(filename, encoding="utf8"):
-    with codecs.open(filename, "r", encoding) as fp:
+def read_sents(filename, encoding='utf8'):
+    with codecs.open(filename, 'r', encoding) as fp:
         sents = [l.rstrip() for l in fp]
 
     # get rid of blank lines
     sents = [l for l in sents if len(l) > 0]
-    sents = [l for l in sents if not l[0] == "#"]
+    sents = [l for l in sents if not l[0] == '#']
     return sents
 
 
@@ -170,7 +171,7 @@ def demo_legacy_grammar():
     )
     print("Reading grammar: %s" % g)
     print("*" * 20)
-    for reading in interpret_sents(["hello"], g, semkey="sem"):
+    for reading in interpret_sents(['hello'], g, semkey='sem'):
         syn, sem = reading[0]
         print()
         print("output: ", sem)
@@ -191,9 +192,9 @@ def demo():
         beta=True,
         syntrace=0,
         semtrace=0,
-        demo="default",
-        grammar="",
-        sentences="",
+        demo='default',
+        grammar='',
+        sentences='',
     )
 
     opts.add_option(
@@ -251,20 +252,20 @@ def demo():
 
     (options, args) = opts.parse_args()
 
-    SPACER = "-" * 30
+    SPACER = '-' * 30
 
     demo_model0()
 
     sents = [
-        "Fido sees a boy with Mary",
-        "John sees Mary",
-        "every girl chases a dog",
-        "every boy chases a girl",
-        "John walks with a girl in Noosa",
-        "who walks",
+        'Fido sees a boy with Mary',
+        'John sees Mary',
+        'every girl chases a dog',
+        'every boy chases a girl',
+        'John walks with a girl in Noosa',
+        'who walks',
     ]
 
-    gramfile = "grammars/sample_grammars/sem2.fcfg"
+    gramfile = 'grammars/sample_grammars/sem2.fcfg'
 
     if options.sentences:
         sentsfile = options.sentences
@@ -287,20 +288,20 @@ def demo():
 
     for i, sent in enumerate(sents):
         n = 1
-        print("\nSentence: %s" % sent)
+        print('\nSentence: %s' % sent)
         print(SPACER)
         if options.evaluate:
 
             for (syntree, semrep, value) in evaluations[i]:
                 if isinstance(value, dict):
                     value = set(value.keys())
-                print("%d:  %s" % (n, semrep))
+                print('%d:  %s' % (n, semrep))
                 print(value)
                 n += 1
         else:
 
             for (syntree, semrep) in semreps[i]:
-                print("%d:  %s" % (n, semrep))
+                print('%d:  %s' % (n, semrep))
                 n += 1
 
 
index f31e472..8bb7c07 100644
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Sentiment Analysis
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
index c945f36..1186fee 100644
Binary files a/nlp_resource_data/nltk/sentiment/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/sentiment/__pycache__/__init__.cpython-37.pyc differ
index dbf8c53..729e8d0 100644
Binary files a/nlp_resource_data/nltk/sentiment/__pycache__/sentiment_analyzer.cpython-37.pyc and b/nlp_resource_data/nltk/sentiment/__pycache__/sentiment_analyzer.cpython-37.pyc differ
index 5500f3a..ffa7c39 100644
Binary files a/nlp_resource_data/nltk/sentiment/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/sentiment/__pycache__/util.cpython-37.pyc differ
index 281428c..7b38ff2 100644
Binary files a/nlp_resource_data/nltk/sentiment/__pycache__/vader.cpython-37.pyc and b/nlp_resource_data/nltk/sentiment/__pycache__/vader.cpython-37.pyc differ
index 9befdd8..4abbc5e 100644
@@ -2,7 +2,7 @@
 #
 # Natural Language Toolkit: Sentiment Analyzer
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Pierpaolo Pantone <24alsecondo@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -13,7 +13,7 @@ using NLTK features and classifiers, especially for teaching and demonstrative
 purposes.
 """
 
-import sys
+from __future__ import print_function
 from collections import defaultdict
 
 from nltk.classify.util import apply_features, accuracy as eval_accuracy
@@ -27,6 +27,8 @@ from nltk.metrics import (
 
 from nltk.probability import FreqDist
 
+from nltk.sentiment.util import save_file, timer
+
 
 class SentimentAnalyzer(object):
     """
@@ -180,19 +182,10 @@ class SentimentAnalyzer(object):
         print("Training classifier")
         self.classifier = trainer(training_set, **kwargs)
         if save_classifier:
-            self.save_file(self.classifier, save_classifier)
+            save_file(self.classifier, save_classifier)
 
         return self.classifier
 
-    def save_file(self, content, filename):
-        """
-        Store `content` in `filename`. Can be used to store a SentimentAnalyzer.
-        """
-        print("Saving", filename, file=sys.stderr)
-        with open(filename, 'wb') as storage_file:
-            # The protocol=2 parameter is for python2 compatibility
-            pickle.dump(content, storage_file, protocol=2)
-
     def evaluate(
         self,
         test_set,
@@ -221,7 +214,7 @@ class SentimentAnalyzer(object):
         metrics_results = {}
         if accuracy == True:
             accuracy_score = eval_accuracy(classifier, test_set)
-            metrics_results["Accuracy"] = accuracy_score
+            metrics_results['Accuracy'] = accuracy_score
 
         gold_results = defaultdict(set)
         test_results = defaultdict(set)
@@ -237,19 +230,19 @@ class SentimentAnalyzer(object):
                 precision_score = eval_precision(
                     gold_results[label], test_results[label]
                 )
-                metrics_results["Precision [{0}]".format(label)] = precision_score
+                metrics_results['Precision [{0}]'.format(label)] = precision_score
             if recall == True:
                 recall_score = eval_recall(gold_results[label], test_results[label])
-                metrics_results["Recall [{0}]".format(label)] = recall_score
+                metrics_results['Recall [{0}]'.format(label)] = recall_score
             if f_measure == True:
                 f_measure_score = eval_f_measure(
                     gold_results[label], test_results[label]
                 )
-                metrics_results["F-measure [{0}]".format(label)] = f_measure_score
+                metrics_results['F-measure [{0}]'.format(label)] = f_measure_score
 
         # Print evaluation results (in alphabetical order)
         if verbose == True:
             for result in sorted(metrics_results):
-                print("{0}: {1}".format(result, metrics_results[result]))
+                print('{0}: {1}'.format(result, metrics_results[result]))
 
         return metrics_results
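The surviving body of evaluate() above shows the pattern: for each label it accumulates the indices of gold instances and of predicted instances into two sets, then hands each pair of sets to the precision/recall/F-measure scorers imported from nltk.metrics. A small sketch of that pattern with made-up index sets; the label names and indices are illustrative only.

from nltk.metrics import f_measure, precision, recall

gold_results = {'pos': {0, 1, 2}, 'neg': {3, 4}}   # instance indices by true label
test_results = {'pos': {0, 1, 4}, 'neg': {2, 3}}   # instance indices by predicted label

for label in sorted(gold_results):
    print(label,
          precision(gold_results[label], test_results[label]),
          recall(gold_results[label], test_results[label]),
          f_measure(gold_results[label], test_results[label]))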
index e2bf22a..334c7b7 100644
@@ -2,7 +2,7 @@
 #
 # Natural Language Toolkit: Sentiment Analyzer
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Pierpaolo Pantone <24alsecondo@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -10,6 +10,7 @@
 """
 Utility methods for Sentiment Analysis.
 """
+from __future__ import division
 
 import codecs
 import csv
@@ -20,6 +21,7 @@ import re
 import sys
 import time
 from copy import deepcopy
+from itertools import tee
 
 import nltk
 from nltk.corpus import CategorizedPlaintextCorpusReader
@@ -43,90 +45,90 @@ NEGATION = r"""
 
 NEGATION_RE = re.compile(NEGATION, re.VERBOSE)
 
-CLAUSE_PUNCT = r"^[.:;!?]$"
+CLAUSE_PUNCT = r'^[.:;!?]$'
 CLAUSE_PUNCT_RE = re.compile(CLAUSE_PUNCT)
 
 # Happy and sad emoticons
 
 HAPPY = set(
     [
-        ":-)",
-        ":)",
-        ";)",
-        ":o)",
-        ":]",
-        ":3",
-        ":c)",
-        ":>",
-        "=]",
-        "8)",
-        "=)",
-        ":}",
-        ":^)",
-        ":-D",
-        ":D",
-        "8-D",
-        "8D",
-        "x-D",
-        "xD",
-        "X-D",
-        "XD",
-        "=-D",
-        "=D",
-        "=-3",
-        "=3",
-        ":-))",
+        ':-)',
+        ':)',
+        ';)',
+        ':o)',
+        ':]',
+        ':3',
+        ':c)',
+        ':>',
+        '=]',
+        '8)',
+        '=)',
+        ':}',
+        ':^)',
+        ':-D',
+        ':D',
+        '8-D',
+        '8D',
+        'x-D',
+        'xD',
+        'X-D',
+        'XD',
+        '=-D',
+        '=D',
+        '=-3',
+        '=3',
+        ':-))',
         ":'-)",
         ":')",
-        ":*",
-        ":^*",
-        ">:P",
-        ":-P",
-        ":P",
-        "X-P",
-        "x-p",
-        "xp",
-        "XP",
-        ":-p",
-        ":p",
-        "=p",
-        ":-b",
-        ":b",
-        ">:)",
-        ">;)",
-        ">:-)",
-        "<3",
+        ':*',
+        ':^*',
+        '>:P',
+        ':-P',
+        ':P',
+        'X-P',
+        'x-p',
+        'xp',
+        'XP',
+        ':-p',
+        ':p',
+        '=p',
+        ':-b',
+        ':b',
+        '>:)',
+        '>;)',
+        '>:-)',
+        '<3',
     ]
 )
 
 SAD = set(
     [
-        ":L",
-        ":-/",
-        ">:/",
-        ":S",
-        ">:[",
-        ":@",
-        ":-(",
-        ":[",
-        ":-||",
-        "=L",
-        ":<",
-        ":-[",
-        ":-<",
-        "=\\",
-        "=/",
-        ">:(",
-        ":(",
-        ">.<",
+        ':L',
+        ':-/',
+        '>:/',
+        ':S',
+        '>:[',
+        ':@',
+        ':-(',
+        ':[',
+        ':-||',
+        '=L',
+        ':<',
+        ':-[',
+        ':-<',
+        '=\\',
+        '=/',
+        '>:(',
+        ':(',
+        '>.<',
         ":'-(",
         ":'(",
-        ":\\",
-        ":-c",
-        ":c",
-        ":{",
-        ">:\\",
-        ";(",
+        ':\\',
+        ':-c',
+        ':c',
+        ':{',
+        '>:\\',
+        ';(',
     ]
 )
 
@@ -146,10 +148,10 @@ def timer(method):
         # in Python 2.x round() will return a float, so we convert it to int
         secs = int(round(tot_time % 60))
         if hours == 0 and mins == 0 and secs < 10:
-            print("[TIMER] {0}(): {:.3f} seconds".format(method.__name__, tot_time))
+            print('[TIMER] {0}(): {:.3f} seconds'.format(method.__name__, tot_time))
         else:
             print(
-                "[TIMER] {0}(): {1}h {2}m {3}s".format(
+                '[TIMER] {0}(): {1}h {2}m {3}s'.format(
                     method.__name__, hours, mins, secs
                 )
             )
@@ -158,6 +160,13 @@ def timer(method):
     return timed
 
 
+def pairwise(iterable):
+    """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
+    a, b = tee(iterable)
+    next(b, None)
+    return zip(a, b)
+
+
 # ////////////////////////////////////////////////////////////
 # { Feature extractor functions
 # ////////////////////////////////////////////////////////////
@@ -189,7 +198,7 @@ def extract_unigram_feats(document, unigrams, handle_negation=False):
     if handle_negation:
         document = mark_negation(document)
     for word in unigrams:
-        features["contains({0})".format(word)] = word in set(document)
+        features['contains({0})'.format(word)] = word in set(document)
     return features
 
 
@@ -212,7 +221,7 @@ def extract_bigram_feats(document, bigrams):
     """
     features = {}
     for bigr in bigrams:
-        features["contains({0} - {1})".format(bigr[0], bigr[1])] = bigr in nltk.bigrams(
+        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(
             document
         )
     return features
@@ -255,11 +264,11 @@ def mark_negation(document, double_neg_flip=False, shallow=False):
                 neg_scope = not neg_scope
                 continue
             else:
-                doc[i] += "_NEG"
+                doc[i] += '_NEG'
         elif neg_scope and CLAUSE_PUNCT_RE.search(word):
             neg_scope = not neg_scope
         elif neg_scope and not CLAUSE_PUNCT_RE.search(word):
-            doc[i] += "_NEG"
+            doc[i] += '_NEG'
 
     return document
 
@@ -268,24 +277,34 @@ def output_markdown(filename, **kwargs):
     """
     Write the output of an analysis to a file.
     """
-    with codecs.open(filename, "at") as outfile:
-        text = "\n*** \n\n"
-        text += "{0} \n\n".format(time.strftime("%d/%m/%Y, %H:%M"))
+    with codecs.open(filename, 'at') as outfile:
+        text = '\n*** \n\n'
+        text += '{0} \n\n'.format(time.strftime("%d/%m/%Y, %H:%M"))
         for k in sorted(kwargs):
             if isinstance(kwargs[k], dict):
                 dictionary = kwargs[k]
-                text += "  - **{0}:**\n".format(k)
+                text += '  - **{0}:**\n'.format(k)
                 for entry in sorted(dictionary):
-                    text += "    - {0}: {1} \n".format(entry, dictionary[entry])
+                    text += '    - {0}: {1} \n'.format(entry, dictionary[entry])
             elif isinstance(kwargs[k], list):
-                text += "  - **{0}:**\n".format(k)
+                text += '  - **{0}:**\n'.format(k)
                 for entry in kwargs[k]:
-                    text += "    - {0}\n".format(entry)
+                    text += '    - {0}\n'.format(entry)
             else:
-                text += "  - **{0}:** {1} \n".format(k, kwargs[k])
+                text += '  - **{0}:** {1} \n'.format(k, kwargs[k])
         outfile.write(text)
 
 
+def save_file(content, filename):
+    """
+    Store `content` in `filename`. Can be used to store a SentimentAnalyzer.
+    """
+    print("Saving", filename)
+    with codecs.open(filename, 'wb') as storage_file:
+        # The protocol=2 parameter is for python2 compatibility
+        pickle.dump(content, storage_file, protocol=2)
+
+
 def split_train_test(all_instances, n=None):
     """
     Randomly split `n` instances of the dataset into train and test sets.
@@ -311,20 +330,20 @@ def _show_plot(x_values, y_values, x_labels=None, y_labels=None):
         import matplotlib.pyplot as plt
     except ImportError:
         raise ImportError(
-            "The plot function requires matplotlib to be installed."
-            "See http://matplotlib.org/"
+            'The plot function requires matplotlib to be installed.'
+            'See http://matplotlib.org/'
         )
 
-    plt.locator_params(axis="y", nbins=3)
+    plt.locator_params(axis='y', nbins=3)
     axes = plt.axes()
     axes.yaxis.grid()
-    plt.plot(x_values, y_values, "ro", color="red")
+    plt.plot(x_values, y_values, 'ro', color='red')
     plt.ylim(ymin=-1.2, ymax=1.2)
     plt.tight_layout(pad=5)
     if x_labels:
-        plt.xticks(x_values, x_labels, rotation="vertical")
+        plt.xticks(x_values, x_labels, rotation='vertical')
     if y_labels:
-        plt.yticks([-1, 0, 1], y_labels, rotation="horizontal")
+        plt.yticks([-1, 0, 1], y_labels, rotation='horizontal')
     # Pad margins so that markers are not clipped by the axes
     plt.margins(0.2)
     plt.show()
@@ -339,8 +358,8 @@ def json2csv_preprocess(
     json_file,
     outfile,
     fields,
-    encoding="utf8",
-    errors="replace",
+    encoding='utf8',
+    errors='replace',
     gzip_compress=False,
     skip_retweets=True,
     skip_tongue_tweets=True,
@@ -373,7 +392,7 @@ def json2csv_preprocess(
         subsets of the original tweets json data.
     """
     with codecs.open(json_file, encoding=encoding) as fp:
-        (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
+        (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
         # write the list of fields as header
         writer.writerow(fields)
 
@@ -384,14 +403,14 @@ def json2csv_preprocess(
             tweet = json.loads(line)
             row = extract_fields(tweet, fields)
             try:
-                text = row[fields.index("text")]
+                text = row[fields.index('text')]
                 # Remove retweets
                 if skip_retweets == True:
-                    if re.search(r"\bRT\b", text):
+                    if re.search(r'\bRT\b', text):
                         continue
                 # Remove tweets containing ":P" and ":-P" emoticons
                 if skip_tongue_tweets == True:
-                    if re.search(r"\:\-?P\b", text):
+                    if re.search(r'\:\-?P\b', text):
                         continue
                 # Remove tweets containing both happy and sad emoticons
                 if skip_ambiguous_tweets == True:
@@ -401,15 +420,15 @@ def json2csv_preprocess(
                             continue
                 # Strip off emoticons from all tweets
                 if strip_off_emoticons == True:
-                    row[fields.index("text")] = re.sub(
-                        r"(?!\n)\s+", " ", EMOTICON_RE.sub("", text)
+                    row[fields.index('text')] = re.sub(
+                        r'(?!\n)\s+', ' ', EMOTICON_RE.sub('', text)
                     )
                 # Remove duplicate tweets
                 if remove_duplicates == True:
-                    if row[fields.index("text")] in tweets_cache:
+                    if row[fields.index('text')] in tweets_cache:
                         continue
                     else:
-                        tweets_cache.append(row[fields.index("text")])
+                        tweets_cache.append(row[fields.index('text')])
             except ValueError:
                 pass
             writer.writerow(row)
@@ -439,28 +458,51 @@ def parse_tweets_set(
     """
     tweets = []
     if not sent_tokenizer:
-        sent_tokenizer = load("tokenizers/punkt/english.pickle")
-
-    with codecs.open(filename, "rt") as csvfile:
-        reader = csv.reader(csvfile)
-        if skip_header == True:
-            next(reader, None)  # skip the header
-        i = 0
-        for tweet_id, text in reader:
-            # text = text[1]
-            i += 1
-            sys.stdout.write("Loaded {0} tweets\r".format(i))
-            # Apply sentence and word tokenizer to text
-            if word_tokenizer:
-                tweet = [
-                    w
-                    for sent in sent_tokenizer.tokenize(text)
-                    for w in word_tokenizer.tokenize(sent)
-                ]
-            else:
-                tweet = text
-            tweets.append((tweet, label))
-
+        sent_tokenizer = load('tokenizers/punkt/english.pickle')
+
+    # If we use Python3.x we can proceed using the 'rt' flag
+    if sys.version_info[0] == 3:
+        with codecs.open(filename, 'rt') as csvfile:
+            reader = csv.reader(csvfile)
+            if skip_header == True:
+                next(reader, None)  # skip the header
+            i = 0
+            for tweet_id, text in reader:
+                # text = text[1]
+                i += 1
+                sys.stdout.write('Loaded {0} tweets\r'.format(i))
+                # Apply sentence and word tokenizer to text
+                if word_tokenizer:
+                    tweet = [
+                        w
+                        for sent in sent_tokenizer.tokenize(text)
+                        for w in word_tokenizer.tokenize(sent)
+                    ]
+                else:
+                    tweet = text
+                tweets.append((tweet, label))
+    # If we use Python2.x we need to handle encoding problems
+    elif sys.version_info[0] < 3:
+        with codecs.open(filename) as csvfile:
+            reader = csv.reader(csvfile)
+            if skip_header == True:
+                next(reader, None)  # skip the header
+            i = 0
+            for row in reader:
+                unicode_row = [x.decode('utf8') for x in row]
+                text = unicode_row[1]
+                i += 1
+                sys.stdout.write('Loaded {0} tweets\r'.format(i))
+                # Apply sentence and word tokenizer to text
+                if word_tokenizer:
+                    tweet = [
+                        w.encode('utf8')
+                        for sent in sent_tokenizer.tokenize(text)
+                        for w in word_tokenizer.tokenize(sent)
+                    ]
+                else:
+                    tweet = text
+                tweets.append((tweet, label))
     print("Loaded {0} tweets".format(i))
     return tweets
 
@@ -496,17 +538,17 @@ def demo_tweets(trainer, n_instances=None, output=None):
     if n_instances is not None:
         n_instances = int(n_instances / 2)
 
-    fields = ["id", "text"]
+    fields = ['id', 'text']
     positive_json = twitter_samples.abspath("positive_tweets.json")
-    positive_csv = "positive_tweets.csv"
+    positive_csv = 'positive_tweets.csv'
     json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)
 
     negative_json = twitter_samples.abspath("negative_tweets.json")
-    negative_csv = "negative_tweets.csv"
+    negative_csv = 'negative_tweets.csv'
     json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)
 
-    neg_docs = parse_tweets_set(negative_csv, label="neg", word_tokenizer=tokenizer)
-    pos_docs = parse_tweets_set(positive_csv, label="pos", word_tokenizer=tokenizer)
+    neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer)
+    pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer)
 
     # We separately split subjective and objective instances to keep a balanced
     # uniform class distribution in both train and test sets.
@@ -542,7 +584,7 @@ def demo_tweets(trainer, n_instances=None, output=None):
         classifier.show_most_informative_features()
     except AttributeError:
         print(
-            "Your classifier does not provide a show_most_informative_features() method."
+            'Your classifier does not provide a show_most_informative_features() method.'
         )
     results = sentim_analyzer.evaluate(test_set)
 
@@ -550,7 +592,7 @@ def demo_tweets(trainer, n_instances=None, output=None):
         extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
         output_markdown(
             output,
-            Dataset="labeled_tweets",
+            Dataset='labeled_tweets',
             Classifier=type(classifier).__name__,
             Tokenizer=tokenizer.__class__.__name__,
             Feats=extr,
@@ -580,12 +622,12 @@ def demo_movie_reviews(trainer, n_instances=None, output=None):
         n_instances = int(n_instances / 2)
 
     pos_docs = [
-        (list(movie_reviews.words(pos_id)), "pos")
-        for pos_id in movie_reviews.fileids("pos")[:n_instances]
+        (list(movie_reviews.words(pos_id)), 'pos')
+        for pos_id in movie_reviews.fileids('pos')[:n_instances]
     ]
     neg_docs = [
-        (list(movie_reviews.words(neg_id)), "neg")
-        for neg_id in movie_reviews.fileids("neg")[:n_instances]
+        (list(movie_reviews.words(neg_id)), 'neg')
+        for neg_id in movie_reviews.fileids('neg')[:n_instances]
     ]
     # We separately split positive and negative instances to keep a balanced
     # uniform class distribution in both train and test sets.
@@ -610,7 +652,7 @@ def demo_movie_reviews(trainer, n_instances=None, output=None):
         classifier.show_most_informative_features()
     except AttributeError:
         print(
-            "Your classifier does not provide a show_most_informative_features() method."
+            'Your classifier does not provide a show_most_informative_features() method.'
         )
     results = sentim_analyzer.evaluate(test_set)
 
@@ -618,9 +660,9 @@ def demo_movie_reviews(trainer, n_instances=None, output=None):
         extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
         output_markdown(
             output,
-            Dataset="Movie_reviews",
+            Dataset='Movie_reviews',
             Classifier=type(classifier).__name__,
-            Tokenizer="WordPunctTokenizer",
+            Tokenizer='WordPunctTokenizer',
             Feats=extr,
             Results=results,
             Instances=n_instances,
@@ -648,10 +690,10 @@ def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=Non
         n_instances = int(n_instances / 2)
 
     subj_docs = [
-        (sent, "subj") for sent in subjectivity.sents(categories="subj")[:n_instances]
+        (sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]
     ]
     obj_docs = [
-        (sent, "obj") for sent in subjectivity.sents(categories="obj")[:n_instances]
+        (sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]
     ]
 
     # We separately split subjective and objective instances to keep a balanced
@@ -680,20 +722,20 @@ def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=Non
         classifier.show_most_informative_features()
     except AttributeError:
         print(
-            "Your classifier does not provide a show_most_informative_features() method."
+            'Your classifier does not provide a show_most_informative_features() method.'
         )
     results = sentim_analyzer.evaluate(test_set)
 
     if save_analyzer == True:
-        save_file(sentim_analyzer, "sa_subjectivity.pickle")
+        save_file(sentim_analyzer, 'sa_subjectivity.pickle')
 
     if output:
         extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
         output_markdown(
             output,
-            Dataset="subjectivity",
+            Dataset='subjectivity',
             Classifier=type(classifier).__name__,
-            Tokenizer="WhitespaceTokenizer",
+            Tokenizer='WhitespaceTokenizer',
             Feats=extr,
             Instances=n_instances,
             Results=results,
@@ -714,10 +756,10 @@ def demo_sent_subjectivity(text):
 
     word_tokenizer = regexp.WhitespaceTokenizer()
     try:
-        sentim_analyzer = load("sa_subjectivity.pickle")
+        sentim_analyzer = load('sa_subjectivity.pickle')
     except LookupError:
-        print("Cannot find the sentiment analyzer you want to load.")
-        print("Training a new one using NaiveBayesClassifier.")
+        print('Cannot find the sentiment analyzer you want to load.')
+        print('Training a new one using NaiveBayesClassifier.')
         sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)
 
     # Tokenize and convert to lower case
@@ -757,15 +799,15 @@ def demo_liu_hu_lexicon(sentence, plot=False):
             y.append(0)  # neutral
 
     if pos_words > neg_words:
-        print("Positive")
+        print('Positive')
     elif pos_words < neg_words:
-        print("Negative")
+        print('Negative')
     elif pos_words == neg_words:
-        print("Neutral")
+        print('Neutral')
 
     if plot == True:
         _show_plot(
-            x, y, x_labels=tokenized_sent, y_labels=["Negative", "Neutral", "Positive"]
+            x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive']
         )
 
 
@@ -801,9 +843,9 @@ def demo_vader_tweets(n_instances=None, output=None):
     if n_instances is not None:
         n_instances = int(n_instances / 2)
 
-    fields = ["id", "text"]
+    fields = ['id', 'text']
     positive_json = twitter_samples.abspath("positive_tweets.json")
-    positive_csv = "positive_tweets.csv"
+    positive_csv = 'positive_tweets.csv'
     json2csv_preprocess(
         positive_json,
         positive_csv,
@@ -813,7 +855,7 @@ def demo_vader_tweets(n_instances=None, output=None):
     )
 
     negative_json = twitter_samples.abspath("negative_tweets.json")
-    negative_csv = "negative_tweets.csv"
+    negative_csv = 'negative_tweets.csv'
     json2csv_preprocess(
         negative_json,
         negative_csv,
@@ -822,8 +864,8 @@ def demo_vader_tweets(n_instances=None, output=None):
         limit=n_instances,
     )
 
-    pos_docs = parse_tweets_set(positive_csv, label="pos")
-    neg_docs = parse_tweets_set(negative_csv, label="neg")
+    pos_docs = parse_tweets_set(positive_csv, label='pos')
+    neg_docs = parse_tweets_set(negative_csv, label='neg')
 
     # We separately split subjective and objective instances to keep a balanced
     # uniform class distribution in both train and test sets.
@@ -845,43 +887,43 @@ def demo_vader_tweets(n_instances=None, output=None):
         labels.add(label)
         gold_results[label].add(i)
         acc_gold_results.append(label)
-        score = vader_analyzer.polarity_scores(text)["compound"]
+        score = vader_analyzer.polarity_scores(text)['compound']
         if score > 0:
-            observed = "pos"
+            observed = 'pos'
         else:
-            observed = "neg"
+            observed = 'neg'
         num += 1
         acc_test_results.append(observed)
         test_results[observed].add(i)
     metrics_results = {}
     for label in labels:
         accuracy_score = eval_accuracy(acc_gold_results, acc_test_results)
-        metrics_results["Accuracy"] = accuracy_score
+        metrics_results['Accuracy'] = accuracy_score
         precision_score = eval_precision(gold_results[label], test_results[label])
-        metrics_results["Precision [{0}]".format(label)] = precision_score
+        metrics_results['Precision [{0}]'.format(label)] = precision_score
         recall_score = eval_recall(gold_results[label], test_results[label])
-        metrics_results["Recall [{0}]".format(label)] = recall_score
+        metrics_results['Recall [{0}]'.format(label)] = recall_score
         f_measure_score = eval_f_measure(gold_results[label], test_results[label])
-        metrics_results["F-measure [{0}]".format(label)] = f_measure_score
+        metrics_results['F-measure [{0}]'.format(label)] = f_measure_score
 
     for result in sorted(metrics_results):
-        print("{0}: {1}".format(result, metrics_results[result]))
+        print('{0}: {1}'.format(result, metrics_results[result]))
 
     if output:
         output_markdown(
             output,
-            Approach="Vader",
-            Dataset="labeled_tweets",
+            Approach='Vader',
+            Dataset='labeled_tweets',
             Instances=n_instances,
             Results=metrics_results,
         )
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     from nltk.classify import NaiveBayesClassifier, MaxentClassifier
     from nltk.classify.scikitlearn import SklearnClassifier
     from sklearn.svm import LinearSVC
-    from nltk.twitter.common import _outf_writer, extract_fields
+    from nltk.twitter.common import outf_writer_compat, extract_fields
 
     naive_bayes = NaiveBayesClassifier.train
     svm = SklearnClassifier(LinearSVC()).train
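
A minimal usage sketch of the scoring path exercised by demo_vader_tweets() above: polarity_scores() returns a dict of 'neg'/'neu'/'pos'/'compound' values, and the demo labels a tweet 'pos' when the 'compound' entry is strictly positive, 'neg' otherwise. The sample sentence and printed values are illustrative only; the sketch assumes the standard nltk.sentiment.vader import path and that the vader_lexicon resource has been downloaded.

    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    analyzer = SentimentIntensityAnalyzer()
    # polarity_scores() returns {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}
    score = analyzer.polarity_scores("The plot was dull, but the acting was great!")['compound']
    # same cut-off used in demo_vader_tweets(): compound > 0 -> 'pos', else 'neg'
    observed = 'pos' if score > 0 else 'neg'
    print(score, observed)
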
index 7ba4251..da9fab7 100644 (file)
@@ -1,12 +1,11 @@
 # coding: utf-8
 # Natural Language Toolkit: vader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu>
 #         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
 #         Pierpaolo Pantone <24alsecondo@gmail.com> (modifications)
 #         George Berry <geb97@cornell.edu> (modifications)
-#         Malavika Suresh <malavika.suresh0794@gmail.com> (modifications)
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 #
@@ -26,255 +25,268 @@ import math
 import re
 import string
 from itertools import product
-
 import nltk.data
-from nltk.util import pairwise
-
-class VaderConstants:
+from .util import pairwise
+
+##Constants##
+
+# (empirically derived mean sentiment intensity rating increase for booster words)
+B_INCR = 0.293
+B_DECR = -0.293
+
+# (empirically derived mean sentiment intensity rating increase for using
+# ALLCAPs to emphasize a word)
+C_INCR = 0.733
+
+N_SCALAR = -0.74
+
+# for removing punctuation
+REGEX_REMOVE_PUNCTUATION = re.compile('[{0}]'.format(re.escape(string.punctuation)))
+
+PUNC_LIST = [
+    ".",
+    "!",
+    "?",
+    ",",
+    ";",
+    ":",
+    "-",
+    "'",
+    "\"",
+    "!!",
+    "!!!",
+    "??",
+    "???",
+    "?!?",
+    "!?!",
+    "?!?!",
+    "!?!?",
+]
+NEGATE = {
+    "aint",
+    "arent",
+    "cannot",
+    "cant",
+    "couldnt",
+    "darent",
+    "didnt",
+    "doesnt",
+    "ain't",
+    "aren't",
+    "can't",
+    "couldn't",
+    "daren't",
+    "didn't",
+    "doesn't",
+    "dont",
+    "hadnt",
+    "hasnt",
+    "havent",
+    "isnt",
+    "mightnt",
+    "mustnt",
+    "neither",
+    "don't",
+    "hadn't",
+    "hasn't",
+    "haven't",
+    "isn't",
+    "mightn't",
+    "mustn't",
+    "neednt",
+    "needn't",
+    "never",
+    "none",
+    "nope",
+    "nor",
+    "not",
+    "nothing",
+    "nowhere",
+    "oughtnt",
+    "shant",
+    "shouldnt",
+    "uhuh",
+    "wasnt",
+    "werent",
+    "oughtn't",
+    "shan't",
+    "shouldn't",
+    "uh-uh",
+    "wasn't",
+    "weren't",
+    "without",
+    "wont",
+    "wouldnt",
+    "won't",
+    "wouldn't",
+    "rarely",
+    "seldom",
+    "despite",
+}
+
+# booster/dampener 'intensifiers' or 'degree adverbs'
+# http://en.wiktionary.org/wiki/Category:English_degree_adverbs
+
+BOOSTER_DICT = {
+    "absolutely": B_INCR,
+    "amazingly": B_INCR,
+    "awfully": B_INCR,
+    "completely": B_INCR,
+    "considerably": B_INCR,
+    "decidedly": B_INCR,
+    "deeply": B_INCR,
+    "effing": B_INCR,
+    "enormously": B_INCR,
+    "entirely": B_INCR,
+    "especially": B_INCR,
+    "exceptionally": B_INCR,
+    "extremely": B_INCR,
+    "fabulously": B_INCR,
+    "flipping": B_INCR,
+    "flippin": B_INCR,
+    "fricking": B_INCR,
+    "frickin": B_INCR,
+    "frigging": B_INCR,
+    "friggin": B_INCR,
+    "fully": B_INCR,
+    "fucking": B_INCR,
+    "greatly": B_INCR,
+    "hella": B_INCR,
+    "highly": B_INCR,
+    "hugely": B_INCR,
+    "incredibly": B_INCR,
+    "intensely": B_INCR,
+    "majorly": B_INCR,
+    "more": B_INCR,
+    "most": B_INCR,
+    "particularly": B_INCR,
+    "purely": B_INCR,
+    "quite": B_INCR,
+    "really": B_INCR,
+    "remarkably": B_INCR,
+    "so": B_INCR,
+    "substantially": B_INCR,
+    "thoroughly": B_INCR,
+    "totally": B_INCR,
+    "tremendously": B_INCR,
+    "uber": B_INCR,
+    "unbelievably": B_INCR,
+    "unusually": B_INCR,
+    "utterly": B_INCR,
+    "very": B_INCR,
+    "almost": B_DECR,
+    "barely": B_DECR,
+    "hardly": B_DECR,
+    "just enough": B_DECR,
+    "kind of": B_DECR,
+    "kinda": B_DECR,
+    "kindof": B_DECR,
+    "kind-of": B_DECR,
+    "less": B_DECR,
+    "little": B_DECR,
+    "marginally": B_DECR,
+    "occasionally": B_DECR,
+    "partly": B_DECR,
+    "scarcely": B_DECR,
+    "slightly": B_DECR,
+    "somewhat": B_DECR,
+    "sort of": B_DECR,
+    "sorta": B_DECR,
+    "sortof": B_DECR,
+    "sort-of": B_DECR,
+}
+
+# check for special case idioms using a sentiment-laden keyword known to SAGE
+SPECIAL_CASE_IDIOMS = {
+    "the shit": 3,
+    "the bomb": 3,
+    "bad ass": 1.5,
+    "yeah right": -2,
+    "cut the mustard": 2,
+    "kiss of death": -1.5,
+    "hand to mouth": -2,
+}
+
+
+##Static methods##
+
+
+def negated(input_words, include_nt=True):
     """
-    A class to keep the Vader lists and constants.
+    Determine if input contains negation words
     """
-    ##Constants##
-    # (empirically derived mean sentiment intensity rating increase for booster words)
-    B_INCR = 0.293
-    B_DECR = -0.293
-
-    # (empirically derived mean sentiment intensity rating increase for using
-    # ALLCAPs to emphasize a word)
-    C_INCR = 0.733
-
-    N_SCALAR = -0.74
-
-    NEGATE = {
-        "aint",
-        "arent",
-        "cannot",
-        "cant",
-        "couldnt",
-        "darent",
-        "didnt",
-        "doesnt",
-        "ain't",
-        "aren't",
-        "can't",
-        "couldn't",
-        "daren't",
-        "didn't",
-        "doesn't",
-        "dont",
-        "hadnt",
-        "hasnt",
-        "havent",
-        "isnt",
-        "mightnt",
-        "mustnt",
-        "neither",
-        "don't",
-        "hadn't",
-        "hasn't",
-        "haven't",
-        "isn't",
-        "mightn't",
-        "mustn't",
-        "neednt",
-        "needn't",
-        "never",
-        "none",
-        "nope",
-        "nor",
-        "not",
-        "nothing",
-        "nowhere",
-        "oughtnt",
-        "shant",
-        "shouldnt",
-        "uhuh",
-        "wasnt",
-        "werent",
-        "oughtn't",
-        "shan't",
-        "shouldn't",
-        "uh-uh",
-        "wasn't",
-        "weren't",
-        "without",
-        "wont",
-        "wouldnt",
-        "won't",
-        "wouldn't",
-        "rarely",
-        "seldom",
-        "despite",
-    }
-
-    # booster/dampener 'intensifiers' or 'degree adverbs'
-    # http://en.wiktionary.org/wiki/Category:English_degree_adverbs
-
-    BOOSTER_DICT = {
-        "absolutely": B_INCR,
-        "amazingly": B_INCR,
-        "awfully": B_INCR,
-        "completely": B_INCR,
-        "considerably": B_INCR,
-        "decidedly": B_INCR,
-        "deeply": B_INCR,
-        "effing": B_INCR,
-        "enormously": B_INCR,
-        "entirely": B_INCR,
-        "especially": B_INCR,
-        "exceptionally": B_INCR,
-        "extremely": B_INCR,
-        "fabulously": B_INCR,
-        "flipping": B_INCR,
-        "flippin": B_INCR,
-        "fricking": B_INCR,
-        "frickin": B_INCR,
-        "frigging": B_INCR,
-        "friggin": B_INCR,
-        "fully": B_INCR,
-        "fucking": B_INCR,
-        "greatly": B_INCR,
-        "hella": B_INCR,
-        "highly": B_INCR,
-        "hugely": B_INCR,
-        "incredibly": B_INCR,
-        "intensely": B_INCR,
-        "majorly": B_INCR,
-        "more": B_INCR,
-        "most": B_INCR,
-        "particularly": B_INCR,
-        "purely": B_INCR,
-        "quite": B_INCR,
-        "really": B_INCR,
-        "remarkably": B_INCR,
-        "so": B_INCR,
-        "substantially": B_INCR,
-        "thoroughly": B_INCR,
-        "totally": B_INCR,
-        "tremendously": B_INCR,
-        "uber": B_INCR,
-        "unbelievably": B_INCR,
-        "unusually": B_INCR,
-        "utterly": B_INCR,
-        "very": B_INCR,
-        "almost": B_DECR,
-        "barely": B_DECR,
-        "hardly": B_DECR,
-        "just enough": B_DECR,
-        "kind of": B_DECR,
-        "kinda": B_DECR,
-        "kindof": B_DECR,
-        "kind-of": B_DECR,
-        "less": B_DECR,
-        "little": B_DECR,
-        "marginally": B_DECR,
-        "occasionally": B_DECR,
-        "partly": B_DECR,
-        "scarcely": B_DECR,
-        "slightly": B_DECR,
-        "somewhat": B_DECR,
-        "sort of": B_DECR,
-        "sorta": B_DECR,
-        "sortof": B_DECR,
-        "sort-of": B_DECR,
-    }
-
-    # check for special case idioms using a sentiment-laden keyword known to SAGE
-    SPECIAL_CASE_IDIOMS = {
-        "the shit": 3,
-        "the bomb": 3,
-        "bad ass": 1.5,
-        "yeah right": -2,
-        "cut the mustard": 2,
-        "kiss of death": -1.5,
-        "hand to mouth": -2,
-    }
-
-    # for removing punctuation
-    REGEX_REMOVE_PUNCTUATION = re.compile("[{0}]".format(re.escape(string.punctuation)))
-
-    PUNC_LIST = [
-        ".",
-        "!",
-        "?",
-        ",",
-        ";",
-        ":",
-        "-",
-        "'",
-        '"',
-        "!!",
-        "!!!",
-        "??",
-        "???",
-        "?!?",
-        "!?!",
-        "?!?!",
-        "!?!?",
-    ]
-
-    def __init__(self):
-        pass
-
-    def negated(self, input_words, include_nt=True):
-        """
-        Determine if input contains negation words
-        """
-        neg_words = self.NEGATE
-        if any(word.lower() in neg_words for word in input_words):
+    neg_words = NEGATE
+    if any(word.lower() in neg_words for word in input_words):
+        return True
+    if include_nt:
+        if any("n't" in word.lower() for word in input_words):
             return True
-        if include_nt:
-            if any("n't" in word.lower() for word in input_words):
-                return True
-        for first, second in pairwise(input_words):
-            if second.lower() == "least" and first.lower() != "at":
-                return True
-        return False
-
-    def normalize(self, score, alpha=15):
-        """
-        Normalize the score to be between -1 and 1 using an alpha that
-        approximates the max expected value
-        """
-        norm_score = score / math.sqrt((score * score) + alpha)
-        return norm_score
+    for first, second in pairwise(input_words):
+        if second.lower() == "least" and first.lower() != 'at':
+            return True
+    return False
 
 
-    def scalar_inc_dec(self, word, valence, is_cap_diff):
-        """
-        Check if the preceding words increase, decrease, or negate/nullify the
-        valence
-        """
-        scalar = 0.0
-        word_lower = word.lower()
-        if word_lower in self.BOOSTER_DICT:
-            scalar = self.BOOSTER_DICT[word_lower]
-            if valence < 0:
-                scalar *= -1
-            # check if booster/dampener word is in ALLCAPS (while others aren't)
-            if word.isupper() and is_cap_diff:
-                if valence > 0:
-                    scalar += self.C_INCR
-                else:
-                    scalar -= self.C_INCR
-        return scalar
+def normalize(score, alpha=15):
+    """
+    Normalize the score to be between -1 and 1 using an alpha that
+    approximates the max expected value
+    """
+    norm_score = score / math.sqrt((score * score) + alpha)
+    return norm_score
+
+
+def allcap_differential(words):
+    """
+    Check whether just some words in the input are ALL CAPS
+
+    :param list words: The words to inspect
+    :returns: `True` if some but not all items in `words` are ALL CAPS
+    """
+    is_different = False
+    allcap_words = 0
+    for word in words:
+        if word.isupper():
+            allcap_words += 1
+    cap_differential = len(words) - allcap_words
+    if 0 < cap_differential < len(words):
+        is_different = True
+    return is_different
+
+
+def scalar_inc_dec(word, valence, is_cap_diff):
+    """
+    Check if the preceding words increase, decrease, or negate/nullify the
+    valence
+    """
+    scalar = 0.0
+    word_lower = word.lower()
+    if word_lower in BOOSTER_DICT:
+        scalar = BOOSTER_DICT[word_lower]
+        if valence < 0:
+            scalar *= -1
+        # check if booster/dampener word is in ALLCAPS (while others aren't)
+        if word.isupper() and is_cap_diff:
+            if valence > 0:
+                scalar += C_INCR
+            else:
+                scalar -= C_INCR
+    return scalar
 
 
-class SentiText:
+class SentiText(object):
     """
     Identify sentiment-relevant string-level properties of input text.
     """
 
-    def __init__(self, text, punc_list, regex_remove_punctuation):
+    def __init__(self, text):
         if not isinstance(text, str):
-            text = str(text.encode("utf-8"))
+            text = str(text.encode('utf-8'))
         self.text = text
-        self.PUNC_LIST = punc_list
-        self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation
         self.words_and_emoticons = self._words_and_emoticons()
-        # doesn't separate words from
+        # doesn't separate words from\
         # adjacent punctuation (keeps emoticons & contractions)
-        self.is_cap_diff = self.allcap_differential(self.words_and_emoticons)
+        self.is_cap_diff = allcap_differential(self.words_and_emoticons)
 
     def _words_plus_punc(self):
         """
@@ -284,14 +296,14 @@ class SentiText:
             ',cat': 'cat',
         }
         """
-        no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text)
+        no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
         # removes punctuation (but loses emoticons & contractions)
         words_only = no_punc_text.split()
         # remove singletons
         words_only = set(w for w in words_only if len(w) > 1)
         # the product gives ('cat', ',') and (',', 'cat')
-        punc_before = {"".join(p): p[1] for p in product(self.PUNC_LIST, words_only)}
-        punc_after = {"".join(p): p[0] for p in product(words_only, self.PUNC_LIST)}
+        punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
+        punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
         words_punc_dict = punc_before
         words_punc_dict.update(punc_after)
         return words_punc_dict
@@ -310,43 +322,25 @@ class SentiText:
                 wes[i] = words_punc_dict[we]
         return wes
 
-    def allcap_differential(self, words):
-        """
-        Check whether just some words in the input are ALL CAPS
 
-        :param list words: The words to inspect
-        :returns: `True` if some but not all items in `words` are ALL CAPS
-        """
-        is_different = False
-        allcap_words = 0
-        for word in words:
-            if word.isupper():
-                allcap_words += 1
-        cap_differential = len(words) - allcap_words
-        if 0 < cap_differential < len(words):
-            is_different = True
-        return is_different
-
-
-class SentimentIntensityAnalyzer:
+class SentimentIntensityAnalyzer(object):
     """
     Give a sentiment intensity score to sentences.
     """
 
     def __init__(
-        self, lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt",
+        self, lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt"
     ):
         self.lexicon_file = nltk.data.load(lexicon_file)
         self.lexicon = self.make_lex_dict()
-        self.constants = VaderConstants()
 
     def make_lex_dict(self):
         """
         Convert lexicon file to a dictionary
         """
         lex_dict = {}
-        for line in self.lexicon_file.split("\n"):
-            (word, measure) = line.strip().split("\t")[0:2]
+        for line in self.lexicon_file.split('\n'):
+            (word, measure) = line.strip().split('\t')[0:2]
             lex_dict[word] = float(measure)
         return lex_dict
 
@@ -356,9 +350,9 @@ class SentimentIntensityAnalyzer:
         Positive values are positive valence, negative value are negative
         valence.
         """
+        sentitext = SentiText(text)
         # text, words_and_emoticons, is_cap_diff = self.preprocess(text)
-        sentitext = SentiText(text, self.constants.PUNC_LIST,
-                              self.constants.REGEX_REMOVE_PUNCTUATION)
+
         sentiments = []
         words_and_emoticons = sentitext.words_and_emoticons
         for item in words_and_emoticons:
@@ -368,7 +362,7 @@ class SentimentIntensityAnalyzer:
                 i < len(words_and_emoticons) - 1
                 and item.lower() == "kind"
                 and words_and_emoticons[i + 1].lower() == "of"
-            ) or item.lower() in self.constants.BOOSTER_DICT:
+            ) or item.lower() in BOOSTER_DICT:
                 sentiments.append(valence)
                 continue
 
@@ -389,9 +383,9 @@ class SentimentIntensityAnalyzer:
             # check if sentiment laden word is in ALL CAPS (while others aren't)
             if item.isupper() and is_cap_diff:
                 if valence > 0:
-                    valence += self.constants.C_INCR
+                    valence += C_INCR
                 else:
-                    valence -= self.constants.C_INCR
+                    valence -= C_INCR
 
             for start_i in range(0, 3):
                 if (
@@ -402,7 +396,7 @@ class SentimentIntensityAnalyzer:
                     # dampen the scalar modifier of preceding words and emoticons
                     # (excluding the ones that immediately precede the item) based
                     # on their distance from the current item.
-                    s = self.constants.scalar_inc_dec(
+                    s = scalar_inc_dec(
                         words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
                     )
                     if start_i == 1 and s != 0:
@@ -439,24 +433,30 @@ class SentimentIntensityAnalyzer:
                 words_and_emoticons[i - 2].lower() != "at"
                 and words_and_emoticons[i - 2].lower() != "very"
             ):
-                valence = valence * self.constants.N_SCALAR
+                valence = valence * N_SCALAR
         elif (
             i > 0
             and words_and_emoticons[i - 1].lower() not in self.lexicon
             and words_and_emoticons[i - 1].lower() == "least"
         ):
-            valence = valence * self.constants.N_SCALAR
+            valence = valence * N_SCALAR
         return valence
 
     def _but_check(self, words_and_emoticons, sentiments):
-        but = {"but", "BUT"} & set(words_and_emoticons)
-        if but:
-            bi = words_and_emoticons.index(next(iter(but)))
-            for sidx, sentiment in enumerate(sentiments):
-                if sidx < bi:
-                    sentiments[sidx] = sentiment * 0.5
-                elif sidx > bi:
-                    sentiments[sidx] = sentiment * 1.5
+        # check for modification in sentiment due to contrastive conjunction 'but'
+        if 'but' in words_and_emoticons or 'BUT' in words_and_emoticons:
+            try:
+                bi = words_and_emoticons.index('but')
+            except ValueError:
+                bi = words_and_emoticons.index('BUT')
+            for sentiment in sentiments:
+                si = sentiments.index(sentiment)
+                if si < bi:
+                    sentiments.pop(si)
+                    sentiments.insert(si, sentiment * 0.5)
+                elif si > bi:
+                    sentiments.pop(si)
+                    sentiments.insert(si, sentiment * 1.5)
         return sentiments
 
     def _idioms_check(self, valence, words_and_emoticons, i):
@@ -485,42 +485,42 @@ class SentimentIntensityAnalyzer:
         sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]
 
         for seq in sequences:
-            if seq in self.constants.SPECIAL_CASE_IDIOMS:
-                valence = self.constants.SPECIAL_CASE_IDIOMS[seq]
+            if seq in SPECIAL_CASE_IDIOMS:
+                valence = SPECIAL_CASE_IDIOMS[seq]
                 break
 
         if len(words_and_emoticons) - 1 > i:
             zeroone = "{0} {1}".format(
                 words_and_emoticons[i], words_and_emoticons[i + 1]
             )
-            if zeroone in self.constants.SPECIAL_CASE_IDIOMS:
-                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone]
+            if zeroone in SPECIAL_CASE_IDIOMS:
+                valence = SPECIAL_CASE_IDIOMS[zeroone]
         if len(words_and_emoticons) - 1 > i + 1:
             zeroonetwo = "{0} {1} {2}".format(
                 words_and_emoticons[i],
                 words_and_emoticons[i + 1],
                 words_and_emoticons[i + 2],
             )
-            if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS:
-                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo]
+            if zeroonetwo in SPECIAL_CASE_IDIOMS:
+                valence = SPECIAL_CASE_IDIOMS[zeroonetwo]
 
         # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
-        if threetwo in self.constants.BOOSTER_DICT or twoone in self.constants.BOOSTER_DICT:
-            valence = valence + self.constants.B_DECR
+        if threetwo in BOOSTER_DICT or twoone in BOOSTER_DICT:
+            valence = valence + B_DECR
         return valence
 
     def _never_check(self, valence, words_and_emoticons, start_i, i):
         if start_i == 0:
-            if self.constants.negated([words_and_emoticons[i - 1]]):
-                valence = valence * self.constants.N_SCALAR
+            if negated([words_and_emoticons[i - 1]]):
+                valence = valence * N_SCALAR
         if start_i == 1:
             if words_and_emoticons[i - 2] == "never" and (
                 words_and_emoticons[i - 1] == "so"
                 or words_and_emoticons[i - 1] == "this"
             ):
                 valence = valence * 1.5
-            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
-                valence = valence * self.constants.N_SCALAR
+            elif negated([words_and_emoticons[i - (start_i + 1)]]):
+                valence = valence * N_SCALAR
         if start_i == 2:
             if (
                 words_and_emoticons[i - 3] == "never"
@@ -534,8 +534,8 @@ class SentimentIntensityAnalyzer:
                 )
             ):
                 valence = valence * 1.25
-            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
-                valence = valence * self.constants.N_SCALAR
+            elif negated([words_and_emoticons[i - (start_i + 1)]]):
+                valence = valence * N_SCALAR
         return valence
 
     def _punctuation_emphasis(self, sum_s, text):
@@ -596,7 +596,7 @@ class SentimentIntensityAnalyzer:
             elif sum_s < 0:
                 sum_s -= punct_emph_amplifier
 
-            compound = self.constants.normalize(sum_s)
+            compound = normalize(sum_s)
             # discriminate between positive, negative and neutral sentiment scores
             pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)
 
index 04efb34..d31603f 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Stemmers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
 #         Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
index 8bf7162..26b9de0 100644 (file)
Binary files a/nlp_resource_data/nltk/stem/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/stem/__pycache__/__init__.cpython-37.pyc differ
index 79cd0fc..2357afe 100644 (file)
Binary files a/nlp_resource_data/nltk/stem/__pycache__/api.cpython-37.pyc and b/nlp_resource_data/nltk/stem/__pycache__/api.cpython-37.pyc differ
index 7732a26..a294175 100644 (file)
Binary files a/nlp_resource_data/nltk/stem/__pycache__/arlstem.cpython-37.pyc and b/nlp_resource_data/nltk/stem/__pycache__/arlstem.cpython-37.pyc differ
index 25ab911..e8f4819 100644 (file)
Binary files a/nlp_resource_data/nltk/stem/__pycache__/cistem.cpython-37.pyc and b/nlp_resource_data/nltk/stem/__pycache__/cistem.cpython-37.pyc differ
index 30f813a..d291d41 100644 (file)
Binary files a/nlp_resource_data/nltk/stem/__pycache__/isri.cpython-37.pyc and b/nlp_resource_data/nltk/stem/__pycache__/isri.cpython-37.pyc differ
index 4682904..fca3ea3 100644 (file)
Binary files a/nlp_resource_data/nltk/stem/__pycache__/lancaster.cpython-37.pyc and b/nlp_resource_data/nltk/stem/__pycache__/lancaster.cpython-37.pyc differ
index 80bc252..84bc63c 100644 (file)
Binary files a/nlp_resource_data/nltk/stem/__pycache__/porter.cpython-37.pyc and b/nlp_resource_data/nltk/stem/__pycache__/porter.cpython-37.pyc differ
index 135d65a..5da55dc 100644 (file)
Binary files a/nlp_resource_data/nltk/stem/__pycache__/regexp.cpython-37.pyc and b/nlp_resource_data/nltk/stem/__pycache__/regexp.cpython-37.pyc differ
index 3e18d37..ba7c2f1 100644 (file)
Binary files a/nlp_resource_data/nltk/stem/__pycache__/rslp.cpython-37.pyc and b/nlp_resource_data/nltk/stem/__pycache__/rslp.cpython-37.pyc differ
index a61a849..bfd6b1e 100644 (file)
Binary files a/nlp_resource_data/nltk/stem/__pycache__/snowball.cpython-37.pyc and b/nlp_resource_data/nltk/stem/__pycache__/snowball.cpython-37.pyc differ
index 4243018..0cfb576 100644 (file)
Binary files a/nlp_resource_data/nltk/stem/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/stem/__pycache__/util.cpython-37.pyc differ
index 6c440a1..74899ba 100644 (file)
Binary files a/nlp_resource_data/nltk/stem/__pycache__/wordnet.cpython-37.pyc and b/nlp_resource_data/nltk/stem/__pycache__/wordnet.cpython-37.pyc differ
index dfa5c27..aa3b326 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Stemmer Interface
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
 #         Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
@@ -8,9 +8,11 @@
 # For license information, see LICENSE.TXT
 
 from abc import ABCMeta, abstractmethod
+from six import add_metaclass
 
 
-class StemmerI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class StemmerI(object):
     """
     A processing interface for removing morphological affixes from
     words.  This process is known as stemming.
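
The StemmerI interface above only requires a stem() method; in the 3.4 form shown here the ABCMeta metaclass is attached with six.add_metaclass instead of the Python 3 metaclass= syntax, but subclassing works the same way. A toy subclass (illustrative only, not an NLTK stemmer) makes the contract concrete:

    from nltk.stem.api import StemmerI

    class SuffixStripper(StemmerI):
        """Toy stemmer: strips a single trailing 's' (illustration only)."""

        def stem(self, token):
            return token[:-1] if token.endswith('s') else token

    print(SuffixStripper().stem('cats'))   # -> 'cat'
    print(SuffixStripper().stem('run'))    # -> 'run'
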
index 86cec73..d2777c0 100644 (file)
@@ -2,7 +2,7 @@
 #
 # Natural Language Toolkit: ARLSTem Stemmer
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 #
 # Author: Kheireddine Abainia (x-programer) <k.abainia@gmail.com>
 # Algorithms: Kheireddine Abainia <k.abainia@gmail.com>
@@ -25,78 +25,79 @@ index, over-stemming index and stemming weight), and the results showed that
 ARLSTem is promising and producing high performances. This stemmer is not
 based on any dictionary and can be used on-line effectively.
 """
+from __future__ import unicode_literals
 import re
 
 from nltk.stem.api import StemmerI
 
 
 class ARLSTem(StemmerI):
-    """
+    '''
     ARLSTem stemmer : a light Arabic Stemming algorithm without any dictionary.
     Department of Telecommunication & Information Processing. USTHB University,
     Algiers, Algeria.
     ARLSTem.stem(token) returns the Arabic stem for the input token.
     The ARLSTem Stemmer requires that all tokens are encoded using Unicode
     encoding.
-    """
+    '''
 
     def __init__(self):
         # different Alif with hamza
-        self.re_hamzated_alif = re.compile(r"[\u0622\u0623\u0625]")
-        self.re_alifMaqsura = re.compile(r"[\u0649]")
-        self.re_diacritics = re.compile(r"[\u064B-\u065F]")
+        self.re_hamzated_alif = re.compile(r'[\u0622\u0623\u0625]')
+        self.re_alifMaqsura = re.compile(r'[\u0649]')
+        self.re_diacritics = re.compile(r'[\u064B-\u065F]')
 
         # Alif Laam, Laam Laam, Fa Laam, Fa Ba
-        self.pr2 = ["\u0627\u0644", "\u0644\u0644", "\u0641\u0644", "\u0641\u0628"]
+        self.pr2 = ['\u0627\u0644', '\u0644\u0644', '\u0641\u0644', '\u0641\u0628']
         # Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam
-        self.pr3 = ["\u0628\u0627\u0644", "\u0643\u0627\u0644", "\u0648\u0627\u0644"]
+        self.pr3 = ['\u0628\u0627\u0644', '\u0643\u0627\u0644', '\u0648\u0627\u0644']
         # Fa Laam Laam, Waaw Laam Laam
-        self.pr32 = ["\u0641\u0644\u0644", "\u0648\u0644\u0644"]
+        self.pr32 = ['\u0641\u0644\u0644', '\u0648\u0644\u0644']
         # Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam
         self.pr4 = [
-            "\u0641\u0628\u0627\u0644",
-            "\u0648\u0628\u0627\u0644",
-            "\u0641\u0643\u0627\u0644",
+            '\u0641\u0628\u0627\u0644',
+            '\u0648\u0628\u0627\u0644',
+            '\u0641\u0643\u0627\u0644',
         ]
 
         # Kaf Yaa, Kaf Miim
-        self.su2 = ["\u0643\u064A", "\u0643\u0645"]
+        self.su2 = ['\u0643\u064A', '\u0643\u0645']
         # Ha Alif, Ha Miim
-        self.su22 = ["\u0647\u0627", "\u0647\u0645"]
+        self.su22 = ['\u0647\u0627', '\u0647\u0645']
         # Kaf Miim Alif, Kaf Noon Shadda
-        self.su3 = ["\u0643\u0645\u0627", "\u0643\u0646\u0651"]
+        self.su3 = ['\u0643\u0645\u0627', '\u0643\u0646\u0651']
         # Ha Miim Alif, Ha Noon Shadda
-        self.su32 = ["\u0647\u0645\u0627", "\u0647\u0646\u0651"]
+        self.su32 = ['\u0647\u0645\u0627', '\u0647\u0646\u0651']
 
         # Alif Noon, Ya Noon, Waaw Noon
-        self.pl_si2 = ["\u0627\u0646", "\u064A\u0646", "\u0648\u0646"]
+        self.pl_si2 = ['\u0627\u0646', '\u064A\u0646', '\u0648\u0646']
         # Taa Alif Noon, Taa Ya Noon
-        self.pl_si3 = ["\u062A\u0627\u0646", "\u062A\u064A\u0646"]
+        self.pl_si3 = ['\u062A\u0627\u0646', '\u062A\u064A\u0646']
 
         # Alif Noon, Waaw Noon
-        self.verb_su2 = ["\u0627\u0646", "\u0648\u0646"]
+        self.verb_su2 = ['\u0627\u0646', '\u0648\u0646']
         # Siin Taa, Siin Yaa
-        self.verb_pr2 = ["\u0633\u062A", "\u0633\u064A"]
+        self.verb_pr2 = ['\u0633\u062A', '\u0633\u064A']
         # Siin Alif, Siin Noon
-        self.verb_pr22 = ["\u0633\u0627", "\u0633\u0646"]
+        self.verb_pr22 = ['\u0633\u0627', '\u0633\u0646']
         # Lam Noon, Lam Taa, Lam Yaa, Lam Hamza
         self.verb_pr33 = [
-            "\u0644\u0646",
-            "\u0644\u062A",
-            "\u0644\u064A",
-            "\u0644\u0623",
+            '\u0644\u0646',
+            '\u0644\u062A',
+            '\u0644\u064A',
+            '\u0644\u0623',
         ]
         # Taa Miim Alif, Taa Noon Shadda
-        self.verb_suf3 = ["\u062A\u0645\u0627", "\u062A\u0646\u0651"]
+        self.verb_suf3 = ['\u062A\u0645\u0627', '\u062A\u0646\u0651']
         # Noon Alif, Taa Miim, Taa Alif, Waaw Alif
         self.verb_suf2 = [
-            "\u0646\u0627",
-            "\u062A\u0645",
-            "\u062A\u0627",
-            "\u0648\u0627",
+            '\u0646\u0627',
+            '\u062A\u0645',
+            '\u062A\u0627',
+            '\u0648\u0627',
         ]
         # Taa, Alif, Noon
-        self.verb_suf1 = ["\u062A", "\u0627", "\u0646"]
+        self.verb_suf1 = ['\u062A', '\u0627', '\u0646']
 
     def stem(self, token):
         """
@@ -140,14 +141,14 @@ class ARLSTem(StemmerI):
             beginning.
         """
         # strip Arabic diacritics
-        token = self.re_diacritics.sub("", token)
+        token = self.re_diacritics.sub('', token)
         # replace Hamzated Alif with Alif bare
-        token = self.re_hamzated_alif.sub("\u0627", token)
+        token = self.re_hamzated_alif.sub('\u0627', token)
         # replace alifMaqsura with Yaa
-        token = self.re_alifMaqsura.sub("\u064A", token)
+        token = self.re_alifMaqsura.sub('\u064A', token)
         # strip the Waaw from the word beginning if the remaining is 3 letters
         # at least
-        if token.startswith("\u0648") and len(token) > 3:
+        if token.startswith('\u0648') and len(token) > 3:
             token = token[1:]
         return token
 
@@ -176,7 +177,7 @@ class ARLSTem(StemmerI):
         """
             remove suffixes from the word's end.
         """
-        if token.endswith("\u0643") and len(token) > 3:
+        if token.endswith('\u0643') and len(token) > 3:
             return token[:-1]
         if len(token) > 4:
             for s2 in self.su2:
@@ -186,7 +187,7 @@ class ARLSTem(StemmerI):
             for s3 in self.su3:
                 if token.endswith(s3):
                     return token[:-3]
-        if token.endswith("\u0647") and len(token) > 3:
+        if token.endswith('\u0647') and len(token) > 3:
             token = token[:-1]
             return token
         if len(token) > 4:
@@ -197,7 +198,7 @@ class ARLSTem(StemmerI):
             for s3 in self.su32:
                 if token.endswith(s3):
                     return token[:-3]
-        if token.endswith("\u0646\u0627") and len(token) > 4:
+        if token.endswith('\u0646\u0627') and len(token) > 4:
             return token[:-2]
         return token
 
@@ -205,7 +206,7 @@ class ARLSTem(StemmerI):
         """
             transform the word from the feminine form to the masculine form.
         """
-        if token.endswith("\u0629") and len(token) > 3:
+        if token.endswith('\u0629') and len(token) > 3:
             return token[:-1]
 
     def plur2sing(self, token):
@@ -220,11 +221,11 @@ class ARLSTem(StemmerI):
             for ps3 in self.pl_si3:
                 if token.endswith(ps3):
                     return token[:-3]
-        if len(token) > 3 and token.endswith("\u0627\u062A"):
+        if len(token) > 3 and token.endswith('\u0627\u062A'):
             return token[:-2]
-        if len(token) > 3 and token.startswith("\u0627") and token[2] == "\u0627":
+        if len(token) > 3 and token.startswith('\u0627') and token[2] == '\u0627':
             return token[:2] + token[3:]
-        if len(token) > 4 and token.startswith("\u0627") and token[-2] == "\u0627":
+        if len(token) > 4 and token.startswith('\u0627') and token[-2] == '\u0627':
             return token[1:-2] + token[-1]
 
     def verb(self, token):
@@ -252,32 +253,32 @@ class ARLSTem(StemmerI):
         """
             stem the present prefixes and suffixes
         """
-        if len(token) > 5 and token.startswith("\u062A"):  # Taa
+        if len(token) > 5 and token.startswith('\u062A'):  # Taa
             for s2 in self.pl_si2:
                 if token.endswith(s2):
                     return token[1:-2]
-        if len(token) > 5 and token.startswith("\u064A"):  # Yaa
+        if len(token) > 5 and token.startswith('\u064A'):  # Yaa
             for s2 in self.verb_su2:
                 if token.endswith(s2):
                     return token[1:-2]
-        if len(token) > 4 and token.startswith("\u0627"):  # Alif
+        if len(token) > 4 and token.startswith('\u0627'):  # Alif
             # Waaw Alif
-            if len(token) > 5 and token.endswith("\u0648\u0627"):
+            if len(token) > 5 and token.endswith('\u0648\u0627'):
                 return token[1:-2]
             # Yaa
-            if token.endswith("\u064A"):
+            if token.endswith('\u064A'):
                 return token[1:-1]
             # Alif
-            if token.endswith("\u0627"):
+            if token.endswith('\u0627'):
                 return token[1:-1]
             # Noon
-            if token.endswith("\u0646"):
+            if token.endswith('\u0646'):
                 return token[1:-1]
         # ^Yaa, Noon$
-        if len(token) > 4 and token.startswith("\u064A") and token.endswith("\u0646"):
+        if len(token) > 4 and token.startswith('\u064A') and token.endswith('\u0646'):
             return token[1:-1]
         # ^Taa, Noon$
-        if len(token) > 4 and token.startswith("\u062A") and token.endswith("\u0646"):
+        if len(token) > 4 and token.startswith('\u062A') and token.endswith('\u0646'):
             return token[1:-1]
 
     def verb_t2(self, token):
@@ -299,14 +300,14 @@ class ARLSTem(StemmerI):
         if (
             len(token) > 5
             and token.startswith(self.verb_pr2[0])
-            and token.endswith("\u0646")
+            and token.endswith('\u0646')
         ):
             return token[2:-1]
         # ^Siin Yaa, Noon$
         if (
             len(token) > 5
             and token.startswith(self.verb_pr2[1])
-            and token.endswith("\u0646")
+            and token.endswith('\u0646')
         ):
             return token[2:-1]
 
@@ -335,7 +336,7 @@ class ARLSTem(StemmerI):
             for pr1 in self.verb_suf1:
                 if token.startswith(pr1):
                     return token[1:]
-            if token.startswith("\u064A"):
+            if token.startswith('\u064A'):
                 return token[1:]
 
     def verb_t5(self, token):
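
A brief usage sketch for the ARLSTem class above, assuming the standard nltk.stem.arlstem import path. The sample token is illustrative; the stem returned depends on the normalization and prefix/suffix rules defined in the class (e.g. the Alif+Laam prefixes listed in self.pr2), so no exact output is asserted here.

    from nltk.stem.arlstem import ARLSTem

    stemmer = ARLSTem()
    word = '\u0627\u0644\u0643\u062a\u0627\u0628'   # Arabic definite form, "the book"
    # diacritics are stripped and prefixes such as Alif+Laam are candidates for removal
    print(stemmer.stem(word))
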
index ef1cc50..efbd5fb 100644 (file)
@@ -1,16 +1,18 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: CISTEM Stemmer for German
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Leonie Weissweiler <l.weissweiler@outlook.de>
 # Algorithm: Leonie Weissweiler <l.weissweiler@outlook.de>
 #            Alexander Fraser <fraser@cis.lmu.de>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import unicode_literals
 import re
 from nltk.stem.api import StemmerI
+from nltk.compat import python_2_unicode_compatible
 
-
+@python_2_unicode_compatible
 class Cistem(StemmerI):
     """
     CISTEM Stemmer for German
@@ -31,7 +33,7 @@ class Cistem(StemmerI):
     is thrice as fast as the Snowball stemmer for German while being about as fast
     as most other stemmers.
 
-    case_insensitive is a a boolean specifying if case-insensitive stemming
+    case_insensitive is a boolean specifying if case-insensitive stemming
     should be used. Case insensitivity improves performance only if words in the
     text may be incorrectly upper case. For all-lowercase and correctly cased
     text, best performance is achieved by setting case_insensitive for false.
@@ -39,7 +41,6 @@ class Cistem(StemmerI):
     :param case_insensitive: if True, the stemming is case insensitive. False by default.
     :type case_insensitive: bool
     """
-
     strip_ge = re.compile(r"^ge(.{4,})")
     repl_xx = re.compile(r"(.)\1")
     strip_emr = re.compile(r"e[mr]$")
@@ -136,6 +137,7 @@ class Cistem(StemmerI):
 
         return word
 
+
     def segment(self, word):
         """
         This method works very similarly to stem (:func:'cistem.stem'). The difference is that in
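
A short usage sketch of the Cistem stemmer patched above, assuming the standard nltk.stem.cistem import path. The German example word is illustrative; the output follows the stripping rules (strip_ge, repl_xx, strip_emr, ...) defined in the class, and segment() (per its docstring above) additionally reports what was removed rather than only the stem.

    from nltk.stem.cistem import Cistem

    stemmer = Cistem()
    word = "Speicherbehältern"
    print(stemmer.stem(word))      # stemmed form, inflectional endings removed step by step
    print(stemmer.segment(word))   # reports the split instead of just the stem
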
index 695e5fa..5e9de8a 100644 (file)
@@ -2,7 +2,7 @@
 #
 # Natural Language Toolkit: The ISRI Arabic Stemmer
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Algorithm: Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005)
 # Author: Hosam Algasaier <hosam_hme@yahoo.com>
 # URL: <http://nltk.org/>
@@ -29,13 +29,14 @@ Additional adjustments were made to improve the algorithm:
 increases the word ambiguities and changes the original root.
 
 """
+from __future__ import unicode_literals
 import re
 
 from nltk.stem.api import StemmerI
 
 
 class ISRIStemmer(StemmerI):
-    """
+    '''
     ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary.
     Information Science Research Institute. University of Nevada, Las Vegas, USA.
 
@@ -47,138 +48,138 @@ class ISRIStemmer(StemmerI):
     The ISRI Stemmer requires that all tokens have Unicode string types.
     If you use Python IDLE on Arabic Windows you have to decode text first
     using Arabic '1256' coding.
-    """
+    '''
 
     def __init__(self):
         # length three prefixes
         self.p3 = [
-            "\u0643\u0627\u0644",
-            "\u0628\u0627\u0644",
-            "\u0648\u0644\u0644",
-            "\u0648\u0627\u0644",
+            '\u0643\u0627\u0644',
+            '\u0628\u0627\u0644',
+            '\u0648\u0644\u0644',
+            '\u0648\u0627\u0644',
         ]
 
         # length two prefixes
-        self.p2 = ["\u0627\u0644", "\u0644\u0644"]
+        self.p2 = ['\u0627\u0644', '\u0644\u0644']
 
         # length one prefixes
         self.p1 = [
-            "\u0644",
-            "\u0628",
-            "\u0641",
-            "\u0633",
-            "\u0648",
-            "\u064a",
-            "\u062a",
-            "\u0646",
-            "\u0627",
+            '\u0644',
+            '\u0628',
+            '\u0641',
+            '\u0633',
+            '\u0648',
+            '\u064a',
+            '\u062a',
+            '\u0646',
+            '\u0627',
         ]
 
         # length three suffixes
         self.s3 = [
-            "\u062a\u0645\u0644",
-            "\u0647\u0645\u0644",
-            "\u062a\u0627\u0646",
-            "\u062a\u064a\u0646",
-            "\u0643\u0645\u0644",
+            '\u062a\u0645\u0644',
+            '\u0647\u0645\u0644',
+            '\u062a\u0627\u0646',
+            '\u062a\u064a\u0646',
+            '\u0643\u0645\u0644',
         ]
 
         # length two suffixes
         self.s2 = [
-            "\u0648\u0646",
-            "\u0627\u062a",
-            "\u0627\u0646",
-            "\u064a\u0646",
-            "\u062a\u0646",
-            "\u0643\u0645",
-            "\u0647\u0646",
-            "\u0646\u0627",
-            "\u064a\u0627",
-            "\u0647\u0627",
-            "\u062a\u0645",
-            "\u0643\u0646",
-            "\u0646\u064a",
-            "\u0648\u0627",
-            "\u0645\u0627",
-            "\u0647\u0645",
+            '\u0648\u0646',
+            '\u0627\u062a',
+            '\u0627\u0646',
+            '\u064a\u0646',
+            '\u062a\u0646',
+            '\u0643\u0645',
+            '\u0647\u0646',
+            '\u0646\u0627',
+            '\u064a\u0627',
+            '\u0647\u0627',
+            '\u062a\u0645',
+            '\u0643\u0646',
+            '\u0646\u064a',
+            '\u0648\u0627',
+            '\u0645\u0627',
+            '\u0647\u0645',
         ]
 
         # length one suffixes
-        self.s1 = ["\u0629", "\u0647", "\u064a", "\u0643", "\u062a", "\u0627", "\u0646"]
+        self.s1 = ['\u0629', '\u0647', '\u064a', '\u0643', '\u062a', '\u0627', '\u0646']
 
         # groups of length four patterns
         self.pr4 = {
-            0: ["\u0645"],
-            1: ["\u0627"],
-            2: ["\u0627", "\u0648", "\u064A"],
-            3: ["\u0629"],
+            0: ['\u0645'],
+            1: ['\u0627'],
+            2: ['\u0627', '\u0648', '\u064A'],
+            3: ['\u0629'],
         }
 
         # Groups of length five patterns and length three roots
         self.pr53 = {
-            0: ["\u0627", "\u062a"],
-            1: ["\u0627", "\u064a", "\u0648"],
-            2: ["\u0627", "\u062a", "\u0645"],
-            3: ["\u0645", "\u064a", "\u062a"],
-            4: ["\u0645", "\u062a"],
-            5: ["\u0627", "\u0648"],
-            6: ["\u0627", "\u0645"],
+            0: ['\u0627', '\u062a'],
+            1: ['\u0627', '\u064a', '\u0648'],
+            2: ['\u0627', '\u062a', '\u0645'],
+            3: ['\u0645', '\u064a', '\u062a'],
+            4: ['\u0645', '\u062a'],
+            5: ['\u0627', '\u0648'],
+            6: ['\u0627', '\u0645'],
         }
 
-        self.re_short_vowels = re.compile(r"[\u064B-\u0652]")
-        self.re_hamza = re.compile(r"[\u0621\u0624\u0626]")
-        self.re_initial_hamza = re.compile(r"^[\u0622\u0623\u0625]")
+        self.re_short_vowels = re.compile(r'[\u064B-\u0652]')
+        self.re_hamza = re.compile(r'[\u0621\u0624\u0626]')
+        self.re_initial_hamza = re.compile(r'^[\u0622\u0623\u0625]')
 
         self.stop_words = [
-            "\u064a\u0643\u0648\u0646",
-            "\u0648\u0644\u064a\u0633",
-            "\u0648\u0643\u0627\u0646",
-            "\u0643\u0630\u0644\u0643",
-            "\u0627\u0644\u062a\u064a",
-            "\u0648\u0628\u064a\u0646",
-            "\u0639\u0644\u064a\u0647\u0627",
-            "\u0645\u0633\u0627\u0621",
-            "\u0627\u0644\u0630\u064a",
-            "\u0648\u0643\u0627\u0646\u062a",
-            "\u0648\u0644\u0643\u0646",
-            "\u0648\u0627\u0644\u062a\u064a",
-            "\u062a\u0643\u0648\u0646",
-            "\u0627\u0644\u064a\u0648\u0645",
-            "\u0627\u0644\u0644\u0630\u064a\u0646",
-            "\u0639\u0644\u064a\u0647",
-            "\u0643\u0627\u0646\u062a",
-            "\u0644\u0630\u0644\u0643",
-            "\u0623\u0645\u0627\u0645",
-            "\u0647\u0646\u0627\u0643",
-            "\u0645\u0646\u0647\u0627",
-            "\u0645\u0627\u0632\u0627\u0644",
-            "\u0644\u0627\u0632\u0627\u0644",
-            "\u0644\u0627\u064a\u0632\u0627\u0644",
-            "\u0645\u0627\u064a\u0632\u0627\u0644",
-            "\u0627\u0635\u0628\u062d",
-            "\u0623\u0635\u0628\u062d",
-            "\u0623\u0645\u0633\u0649",
-            "\u0627\u0645\u0633\u0649",
-            "\u0623\u0636\u062d\u0649",
-            "\u0627\u0636\u062d\u0649",
-            "\u0645\u0627\u0628\u0631\u062d",
-            "\u0645\u0627\u0641\u062a\u0626",
-            "\u0645\u0627\u0627\u0646\u0641\u0643",
-            "\u0644\u0627\u0633\u064a\u0645\u0627",
-            "\u0648\u0644\u0627\u064a\u0632\u0627\u0644",
-            "\u0627\u0644\u062d\u0627\u0644\u064a",
-            "\u0627\u0644\u064a\u0647\u0627",
-            "\u0627\u0644\u0630\u064a\u0646",
-            "\u0641\u0627\u0646\u0647",
-            "\u0648\u0627\u0644\u0630\u064a",
-            "\u0648\u0647\u0630\u0627",
-            "\u0644\u0647\u0630\u0627",
-            "\u0641\u0643\u0627\u0646",
-            "\u0633\u062a\u0643\u0648\u0646",
-            "\u0627\u0644\u064a\u0647",
-            "\u064a\u0645\u0643\u0646",
-            "\u0628\u0647\u0630\u0627",
-            "\u0627\u0644\u0630\u0649",
+            '\u064a\u0643\u0648\u0646',
+            '\u0648\u0644\u064a\u0633',
+            '\u0648\u0643\u0627\u0646',
+            '\u0643\u0630\u0644\u0643',
+            '\u0627\u0644\u062a\u064a',
+            '\u0648\u0628\u064a\u0646',
+            '\u0639\u0644\u064a\u0647\u0627',
+            '\u0645\u0633\u0627\u0621',
+            '\u0627\u0644\u0630\u064a',
+            '\u0648\u0643\u0627\u0646\u062a',
+            '\u0648\u0644\u0643\u0646',
+            '\u0648\u0627\u0644\u062a\u064a',
+            '\u062a\u0643\u0648\u0646',
+            '\u0627\u0644\u064a\u0648\u0645',
+            '\u0627\u0644\u0644\u0630\u064a\u0646',
+            '\u0639\u0644\u064a\u0647',
+            '\u0643\u0627\u0646\u062a',
+            '\u0644\u0630\u0644\u0643',
+            '\u0623\u0645\u0627\u0645',
+            '\u0647\u0646\u0627\u0643',
+            '\u0645\u0646\u0647\u0627',
+            '\u0645\u0627\u0632\u0627\u0644',
+            '\u0644\u0627\u0632\u0627\u0644',
+            '\u0644\u0627\u064a\u0632\u0627\u0644',
+            '\u0645\u0627\u064a\u0632\u0627\u0644',
+            '\u0627\u0635\u0628\u062d',
+            '\u0623\u0635\u0628\u062d',
+            '\u0623\u0645\u0633\u0649',
+            '\u0627\u0645\u0633\u0649',
+            '\u0623\u0636\u062d\u0649',
+            '\u0627\u0636\u062d\u0649',
+            '\u0645\u0627\u0628\u0631\u062d',
+            '\u0645\u0627\u0641\u062a\u0626',
+            '\u0645\u0627\u0627\u0646\u0641\u0643',
+            '\u0644\u0627\u0633\u064a\u0645\u0627',
+            '\u0648\u0644\u0627\u064a\u0632\u0627\u0644',
+            '\u0627\u0644\u062d\u0627\u0644\u064a',
+            '\u0627\u0644\u064a\u0647\u0627',
+            '\u0627\u0644\u0630\u064a\u0646',
+            '\u0641\u0627\u0646\u0647',
+            '\u0648\u0627\u0644\u0630\u064a',
+            '\u0648\u0647\u0630\u0627',
+            '\u0644\u0647\u0630\u0627',
+            '\u0641\u0643\u0627\u0646',
+            '\u0633\u062a\u0643\u0648\u0646',
+            '\u0627\u0644\u064a\u0647',
+            '\u064a\u0645\u0643\u0646',
+            '\u0628\u0647\u0630\u0627',
+            '\u0627\u0644\u0630\u0649',
         ]
 
     def stem(self, token):
@@ -226,12 +227,12 @@ class ISRIStemmer(StemmerI):
         num=3  both 1&2
         """
         if num == 1:
-            word = self.re_short_vowels.sub("", word)
+            word = self.re_short_vowels.sub('', word)
         elif num == 2:
-            word = self.re_initial_hamza.sub("\u0627", word)
+            word = self.re_initial_hamza.sub('\u0627', word)
         elif num == 3:
-            word = self.re_short_vowels.sub("", word)
-            word = self.re_initial_hamza.sub("\u0627", word)
+            word = self.re_short_vowels.sub('', word)
+            word = self.re_initial_hamza.sub('\u0627', word)
         return word
 
     def pre32(self, word):
@@ -260,7 +261,7 @@ class ISRIStemmer(StemmerI):
 
     def waw(self, word):
         """remove connective ‘و’ if it precedes a word beginning with ‘و’ """
-        if len(word) >= 4 and word[:2] == "\u0648\u0648":
+        if len(word) >= 4 and word[:2] == '\u0648\u0648':
             word = word[1:]
         return word
 
@@ -282,35 +283,35 @@ class ISRIStemmer(StemmerI):
 
     def pro_w53(self, word):
         """process length five patterns and extract length three roots"""
-        if word[2] in self.pr53[0] and word[0] == "\u0627":  # افتعل - افاعل
+        if word[2] in self.pr53[0] and word[0] == '\u0627':  # افتعل - افاعل
             word = word[1] + word[3:]
-        elif word[3] in self.pr53[1] and word[0] == "\u0645":  # مفعول - مفعال - مفعيل
+        elif word[3] in self.pr53[1] and word[0] == '\u0645':  # مفعول - مفعال - مفعيل
             word = word[1:3] + word[4]
-        elif word[0] in self.pr53[2] and word[4] == "\u0629":  # مفعلة - تفعلة - افعلة
+        elif word[0] in self.pr53[2] and word[4] == '\u0629':  # مفعلة - تفعلة - افعلة
             word = word[1:4]
-        elif word[0] in self.pr53[3] and word[2] == "\u062a":  # مفتعل - يفتعل - تفتعل
+        elif word[0] in self.pr53[3] and word[2] == '\u062a':  # مفتعل - يفتعل - تفتعل
             word = word[1] + word[3:]
-        elif word[0] in self.pr53[4] and word[2] == "\u0627":  # مفاعل - تفاعل
+        elif word[0] in self.pr53[4] and word[2] == '\u0627':  # مفاعل - تفاعل
             word = word[1] + word[3:]
-        elif word[2] in self.pr53[5] and word[4] == "\u0629":  # فعولة - فعالة
+        elif word[2] in self.pr53[5] and word[4] == '\u0629':  # فعولة - فعالة
             word = word[:2] + word[3]
-        elif word[0] in self.pr53[6] and word[1] == "\u0646":  # انفعل - منفعل
+        elif word[0] in self.pr53[6] and word[1] == '\u0646':  # انفعل - منفعل
             word = word[2:]
-        elif word[3] == "\u0627" and word[0] == "\u0627":  # افعال
+        elif word[3] == '\u0627' and word[0] == '\u0627':  # افعال
             word = word[1:3] + word[4]
-        elif word[4] == "\u0646" and word[3] == "\u0627":  # فعلان
+        elif word[4] == '\u0646' and word[3] == '\u0627':  # فعلان
             word = word[:3]
-        elif word[3] == "\u064a" and word[0] == "\u062a":  # تفعيل
+        elif word[3] == '\u064a' and word[0] == '\u062a':  # تفعيل
             word = word[1:3] + word[4]
-        elif word[3] == "\u0648" and word[1] == "\u0627":  # فاعول
+        elif word[3] == '\u0648' and word[1] == '\u0627':  # فاعول
             word = word[0] + word[2] + word[4]
-        elif word[2] == "\u0627" and word[1] == "\u0648":  # فواعل
+        elif word[2] == '\u0627' and word[1] == '\u0648':  # فواعل
             word = word[0] + word[3:]
-        elif word[3] == "\u0626" and word[2] == "\u0627":  # فعائل
+        elif word[3] == '\u0626' and word[2] == '\u0627':  # فعائل
             word = word[:2] + word[4]
-        elif word[4] == "\u0629" and word[1] == "\u0627":  # فاعلة
+        elif word[4] == '\u0629' and word[1] == '\u0627':  # فاعلة
             word = word[0] + word[2:4]
-        elif word[4] == "\u064a" and word[2] == "\u0627":  # فعالي
+        elif word[4] == '\u064a' and word[2] == '\u0627':  # فعالي
             word = word[:2] + word[3]
         else:
             word = self.suf1(word)  # do - normalize short suffix
@@ -322,9 +323,9 @@ class ISRIStemmer(StemmerI):
         """process length five patterns and extract length four roots"""
         if word[0] in self.pr53[2]:  # تفعلل - افعلل - مفعلل
             word = word[1:]
-        elif word[4] == "\u0629":  # فعللة
+        elif word[4] == '\u0629':  # فعللة
             word = word[:4]
-        elif word[2] == "\u0627":  # فعالل
+        elif word[2] == '\u0627':  # فعالل
             word = word[:2] + word[3:]
         return word
 
@@ -338,24 +339,24 @@ class ISRIStemmer(StemmerI):
 
     def pro_w6(self, word):
         """process length six patterns and extract length three roots"""
-        if word.startswith("\u0627\u0633\u062a") or word.startswith(
-            "\u0645\u0633\u062a"
+        if word.startswith('\u0627\u0633\u062a') or word.startswith(
+            '\u0645\u0633\u062a'
         ):  # مستفعل - استفعل
             word = word[3:]
         elif (
-            word[0] == "\u0645" and word[3] == "\u0627" and word[5] == "\u0629"
+            word[0] == '\u0645' and word[3] == '\u0627' and word[5] == '\u0629'
         ):  # مفعالة
             word = word[1:3] + word[4]
         elif (
-            word[0] == "\u0627" and word[2] == "\u062a" and word[4] == "\u0627"
+            word[0] == '\u0627' and word[2] == '\u062a' and word[4] == '\u0627'
         ):  # افتعال
             word = word[1] + word[3] + word[5]
         elif (
-            word[0] == "\u0627" and word[3] == "\u0648" and word[2] == word[4]
+            word[0] == '\u0627' and word[3] == '\u0648' and word[2] == word[4]
         ):  # افعوعل
             word = word[1] + word[4:]
         elif (
-            word[0] == "\u062a" and word[2] == "\u0627" and word[4] == "\u064a"
+            word[0] == '\u062a' and word[2] == '\u0627' and word[4] == '\u064a'
         ):  # تفاعيل   new pattern
             word = word[1] + word[3] + word[5]
         else:
@@ -366,9 +367,9 @@ class ISRIStemmer(StemmerI):
 
     def pro_w64(self, word):
         """process length six patterns and extract length four roots"""
-        if word[0] == "\u0627" and word[4] == "\u0627":  # افعلال
+        if word[0] == '\u0627' and word[4] == '\u0627':  # افعلال
             word = word[1:4] + word[5]
-        elif word.startswith("\u0645\u062a"):  # متفعلل
+        elif word.startswith('\u0645\u062a'):  # متفعلل
             word = word[2:]
         return word
 
index ef5eaa4..919a1a6 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Stemmers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Tomcavage <stomcava@law.upenn.edu>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -9,11 +9,14 @@
 A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
 Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
 """
+from __future__ import unicode_literals
 import re
 
 from nltk.stem.api import StemmerI
+from nltk.compat import python_2_unicode_compatible
 
 
+@python_2_unicode_compatible
 class LancasterStemmer(StemmerI):
     """
     Lancaster Stemmer
@@ -267,7 +270,7 @@ class LancasterStemmer(StemmerI):
                                         word, remove_total, append_string
                                     )
                                     rule_was_applied = True
-                                    if cont_flag == ".":
+                                    if cont_flag == '.':
                                         proceed = False
                                     break
                             elif self.__isAcceptable(word, remove_total):
@@ -275,7 +278,7 @@ class LancasterStemmer(StemmerI):
                                     word, remove_total, append_string
                                 )
                                 rule_was_applied = True
-                                if cont_flag == ".":
+                                if cont_flag == '.':
                                     proceed = False
                                 break
                 # If no rules apply, the word doesn't need any more stemming
@@ -346,4 +349,4 @@ class LancasterStemmer(StemmerI):
         return word
 
     def __repr__(self):
-        return "<LancasterStemmer>"
+        return '<LancasterStemmer>'
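
A brief usage sketch of the LancasterStemmer patched above, assuming the standard nltk.stem.lancaster import path. The Paice/Husk rules are iterative and fairly aggressive; the output noted in the comment is typical but should be treated as illustrative.

    from nltk.stem.lancaster import LancasterStemmer

    stemmer = LancasterStemmer()
    for word in ('maximum', 'presumably', 'multiply'):
        # e.g. 'maximum' typically reduces to 'maxim' under the default rule set
        print(word, '->', stemmer.stem(word))
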
index cb04f52..e79b8b6 100644 (file)
@@ -18,13 +18,17 @@ which includes another Python implementation and other implementations
 in many languages.
 """
 
-__docformat__ = "plaintext"
+from __future__ import print_function, unicode_literals
+
+__docformat__ = 'plaintext'
 
 import re
 
 from nltk.stem.api import StemmerI
+from nltk.compat import python_2_unicode_compatible
 
 
+@python_2_unicode_compatible
 class PorterStemmer(StemmerI):
     """
     A word stemmer based on the Porter stemming algorithm.
@@ -71,14 +75,14 @@ class PorterStemmer(StemmerI):
     For the best stemming, you should use the default NLTK_EXTENSIONS
     version. However, if you need to get the same results as either the
     original algorithm or one of Martin Porter's hosted versions for
-    compatibility with an existing implementation or dataset, you can use
+    compability with an existing implementation or dataset, you can use
     one of the other modes instead.
     """
 
     # Modes the Stemmer can be instantiated in
-    NLTK_EXTENSIONS = "NLTK_EXTENSIONS"
-    MARTIN_EXTENSIONS = "MARTIN_EXTENSIONS"
-    ORIGINAL_ALGORITHM = "ORIGINAL_ALGORITHM"
+    NLTK_EXTENSIONS = 'NLTK_EXTENSIONS'
+    MARTIN_EXTENSIONS = 'MARTIN_EXTENSIONS'
+    ORIGINAL_ALGORITHM = 'ORIGINAL_ALGORITHM'
 
     def __init__(self, mode=NLTK_EXTENSIONS):
         if mode not in (
@@ -118,7 +122,7 @@ class PorterStemmer(StemmerI):
                 for val in irregular_forms[key]:
                     self.pool[val] = key
 
-        self.vowels = frozenset(["a", "e", "i", "o", "u"])
+        self.vowels = frozenset(['a', 'e', 'i', 'o', 'u'])
 
     def _is_consonant(self, word, i):
         """Returns True if word[i] is a consonant, False otherwise
@@ -134,7 +138,7 @@ class PorterStemmer(StemmerI):
         """
         if word[i] in self.vowels:
             return False
-        if word[i] == "y":
+        if word[i] == 'y':
             if i == 0:
                 return True
             else:
@@ -175,7 +179,7 @@ class PorterStemmer(StemmerI):
                 m=1    TROUBLE,  OATS,  TREES,  IVY.
                 m=2    TROUBLES,  PRIVATE,  OATEN,  ORRERY.
         """
-        cv_sequence = ""
+        cv_sequence = ''
 
         # Construct a string of 'c's and 'v's representing whether each
         # character in `stem` is a consonant or a vowel.
@@ -183,14 +187,14 @@ class PorterStemmer(StemmerI):
         #      'architecture' becomes 'vcccvcvccvcv'
         for i in range(len(stem)):
             if self._is_consonant(stem, i):
-                cv_sequence += "c"
+                cv_sequence += 'c'
             else:
-                cv_sequence += "v"
+                cv_sequence += 'v'
 
         # Count the number of 'vc' occurences, which is equivalent to
         # the number of 'VC' occurrences in Porter's reduced form in the
         # docstring above, which is in turn equivalent to `m`
-        return cv_sequence.count("vc")
+        return cv_sequence.count('vc')
 
     def _has_positive_measure(self, stem):
         return self._measure(stem) > 0
@@ -226,7 +230,7 @@ class PorterStemmer(StemmerI):
             and self._is_consonant(word, len(word) - 3)
             and not self._is_consonant(word, len(word) - 2)
             and self._is_consonant(word, len(word) - 1)
-            and word[-1] not in ("w", "x", "y")
+            and word[-1] not in ('w', 'x', 'y')
         ) or (
             self.mode == self.NLTK_EXTENSIONS
             and len(word) == 2
@@ -237,7 +241,7 @@ class PorterStemmer(StemmerI):
     def _replace_suffix(self, word, suffix, replacement):
         """Replaces `suffix` of `word` with `replacement"""
         assert word.endswith(suffix), "Given word doesn't end with given suffix"
-        if suffix == "":
+        if suffix == '':
             return word + replacement
         else:
             return word[: -len(suffix)] + replacement
@@ -253,7 +257,7 @@ class PorterStemmer(StemmerI):
         """
         for rule in rules:
             suffix, replacement, condition = rule
-            if suffix == "*d" and self._ends_double_consonant(word):
+            if suffix == '*d' and self._ends_double_consonant(word):
                 stem = word[:-2]
                 if condition is None or condition(stem):
                     return stem + replacement
@@ -261,7 +265,7 @@ class PorterStemmer(StemmerI):
                     # Don't try any further rules
                     return word
             if word.endswith(suffix):
-                stem = self._replace_suffix(word, suffix, "")
+                stem = self._replace_suffix(word, suffix, '')
                 if condition is None or condition(stem):
                     return stem + replacement
                 else:
@@ -284,16 +288,16 @@ class PorterStemmer(StemmerI):
         # this NLTK-only rule extends the original algorithm, so
         # that 'flies'->'fli' but 'dies'->'die' etc
         if self.mode == self.NLTK_EXTENSIONS:
-            if word.endswith("ies") and len(word) == 4:
-                return self._replace_suffix(word, "ies", "ie")
+            if word.endswith('ies') and len(word) == 4:
+                return self._replace_suffix(word, 'ies', 'ie')
 
         return self._apply_rule_list(
             word,
             [
-                ("sses", "ss", None),  # SSES -> SS
-                ("ies", "i", None),  # IES  -> I
-                ("ss", "ss", None),  # SS   -> SS
-                ("s", "", None),  # S    ->
+                ('sses', 'ss', None),  # SSES -> SS
+                ('ies', 'i', None),  # IES  -> I
+                ('ss', 'ss', None),  # SS   -> SS
+                ('s', '', None),  # S    ->
             ],
         )
 
@@ -333,25 +337,25 @@ class PorterStemmer(StemmerI):
         # this NLTK-only block extends the original algorithm, so that
         # 'spied'->'spi' but 'died'->'die' etc
         if self.mode == self.NLTK_EXTENSIONS:
-            if word.endswith("ied"):
+            if word.endswith('ied'):
                 if len(word) == 4:
-                    return self._replace_suffix(word, "ied", "ie")
+                    return self._replace_suffix(word, 'ied', 'ie')
                 else:
-                    return self._replace_suffix(word, "ied", "i")
+                    return self._replace_suffix(word, 'ied', 'i')
 
         # (m>0) EED -> EE
-        if word.endswith("eed"):
-            stem = self._replace_suffix(word, "eed", "")
+        if word.endswith('eed'):
+            stem = self._replace_suffix(word, 'eed', '')
             if self._measure(stem) > 0:
-                return stem + "ee"
+                return stem + 'ee'
             else:
                 return word
 
         rule_2_or_3_succeeded = False
 
-        for suffix in ["ed", "ing"]:
+        for suffix in ['ed', 'ing']:
             if word.endswith(suffix):
-                intermediate_stem = self._replace_suffix(word, suffix, "")
+                intermediate_stem = self._replace_suffix(word, suffix, '')
                 if self._contains_vowel(intermediate_stem):
                     rule_2_or_3_succeeded = True
                     break
@@ -362,20 +366,20 @@ class PorterStemmer(StemmerI):
         return self._apply_rule_list(
             intermediate_stem,
             [
-                ("at", "ate", None),  # AT -> ATE
-                ("bl", "ble", None),  # BL -> BLE
-                ("iz", "ize", None),  # IZ -> IZE
+                ('at', 'ate', None),  # AT -> ATE
+                ('bl', 'ble', None),  # BL -> BLE
+                ('iz', 'ize', None),  # IZ -> IZE
                 # (*d and not (*L or *S or *Z))
                 # -> single letter
                 (
-                    "*d",
+                    '*d',
                     intermediate_stem[-1],
-                    lambda stem: intermediate_stem[-1] not in ("l", "s", "z"),
+                    lambda stem: intermediate_stem[-1] not in ('l', 's', 'z'),
                 ),
                 # (m=1 and *o) -> E
                 (
-                    "",
-                    "e",
+                    '',
+                    'e',
                     lambda stem: (self._measure(stem) == 1 and self._ends_cvc(stem)),
                 ),
             ],
@@ -420,8 +424,8 @@ class PorterStemmer(StemmerI):
             word,
             [
                 (
-                    "y",
-                    "i",
+                    'y',
+                    'i',
                     nltk_condition
                     if self.mode == self.NLTK_EXTENSIONS
                     else original_condition,
@@ -463,39 +467,39 @@ class PorterStemmer(StemmerI):
             # Instead of applying the ALLI -> AL rule after '(a)bli' per
             # the published algorithm, instead we apply it first, and,
             # if it succeeds, run the result through step2 again.
-            if word.endswith("alli") and self._has_positive_measure(
-                self._replace_suffix(word, "alli", "")
+            if word.endswith('alli') and self._has_positive_measure(
+                self._replace_suffix(word, 'alli', '')
             ):
-                return self._step2(self._replace_suffix(word, "alli", "al"))
+                return self._step2(self._replace_suffix(word, 'alli', 'al'))
 
-        bli_rule = ("bli", "ble", self._has_positive_measure)
-        abli_rule = ("abli", "able", self._has_positive_measure)
+        bli_rule = ('bli', 'ble', self._has_positive_measure)
+        abli_rule = ('abli', 'able', self._has_positive_measure)
 
         rules = [
-            ("ational", "ate", self._has_positive_measure),
-            ("tional", "tion", self._has_positive_measure),
-            ("enci", "ence", self._has_positive_measure),
-            ("anci", "ance", self._has_positive_measure),
-            ("izer", "ize", self._has_positive_measure),
+            ('ational', 'ate', self._has_positive_measure),
+            ('tional', 'tion', self._has_positive_measure),
+            ('enci', 'ence', self._has_positive_measure),
+            ('anci', 'ance', self._has_positive_measure),
+            ('izer', 'ize', self._has_positive_measure),
             abli_rule if self.mode == self.ORIGINAL_ALGORITHM else bli_rule,
-            ("alli", "al", self._has_positive_measure),
-            ("entli", "ent", self._has_positive_measure),
-            ("eli", "e", self._has_positive_measure),
-            ("ousli", "ous", self._has_positive_measure),
-            ("ization", "ize", self._has_positive_measure),
-            ("ation", "ate", self._has_positive_measure),
-            ("ator", "ate", self._has_positive_measure),
-            ("alism", "al", self._has_positive_measure),
-            ("iveness", "ive", self._has_positive_measure),
-            ("fulness", "ful", self._has_positive_measure),
-            ("ousness", "ous", self._has_positive_measure),
-            ("aliti", "al", self._has_positive_measure),
-            ("iviti", "ive", self._has_positive_measure),
-            ("biliti", "ble", self._has_positive_measure),
+            ('alli', 'al', self._has_positive_measure),
+            ('entli', 'ent', self._has_positive_measure),
+            ('eli', 'e', self._has_positive_measure),
+            ('ousli', 'ous', self._has_positive_measure),
+            ('ization', 'ize', self._has_positive_measure),
+            ('ation', 'ate', self._has_positive_measure),
+            ('ator', 'ate', self._has_positive_measure),
+            ('alism', 'al', self._has_positive_measure),
+            ('iveness', 'ive', self._has_positive_measure),
+            ('fulness', 'ful', self._has_positive_measure),
+            ('ousness', 'ous', self._has_positive_measure),
+            ('aliti', 'al', self._has_positive_measure),
+            ('iviti', 'ive', self._has_positive_measure),
+            ('biliti', 'ble', self._has_positive_measure),
         ]
 
         if self.mode == self.NLTK_EXTENSIONS:
-            rules.append(("fulli", "ful", self._has_positive_measure))
+            rules.append(('fulli', 'ful', self._has_positive_measure))
 
             # The 'l' of the 'logi' -> 'log' rule is put with the stem,
             # so that short stems like 'geo' 'theo' etc work like
@@ -527,13 +531,13 @@ class PorterStemmer(StemmerI):
         return self._apply_rule_list(
             word,
             [
-                ("icate", "ic", self._has_positive_measure),
-                ("ative", "", self._has_positive_measure),
-                ("alize", "al", self._has_positive_measure),
-                ("iciti", "ic", self._has_positive_measure),
-                ("ical", "ic", self._has_positive_measure),
-                ("ful", "", self._has_positive_measure),
-                ("ness", "", self._has_positive_measure),
+                ('icate', 'ic', self._has_positive_measure),
+                ('ative', '', self._has_positive_measure),
+                ('alize', 'al', self._has_positive_measure),
+                ('iciti', 'ic', self._has_positive_measure),
+                ('ical', 'ic', self._has_positive_measure),
+                ('ful', '', self._has_positive_measure),
+                ('ness', '', self._has_positive_measure),
             ],
         )
 
@@ -570,30 +574,30 @@ class PorterStemmer(StemmerI):
         return self._apply_rule_list(
             word,
             [
-                ("al", "", measure_gt_1),
-                ("ance", "", measure_gt_1),
-                ("ence", "", measure_gt_1),
-                ("er", "", measure_gt_1),
-                ("ic", "", measure_gt_1),
-                ("able", "", measure_gt_1),
-                ("ible", "", measure_gt_1),
-                ("ant", "", measure_gt_1),
-                ("ement", "", measure_gt_1),
-                ("ment", "", measure_gt_1),
-                ("ent", "", measure_gt_1),
+                ('al', '', measure_gt_1),
+                ('ance', '', measure_gt_1),
+                ('ence', '', measure_gt_1),
+                ('er', '', measure_gt_1),
+                ('ic', '', measure_gt_1),
+                ('able', '', measure_gt_1),
+                ('ible', '', measure_gt_1),
+                ('ant', '', measure_gt_1),
+                ('ement', '', measure_gt_1),
+                ('ment', '', measure_gt_1),
+                ('ent', '', measure_gt_1),
                 # (m>1 and (*S or *T)) ION ->
                 (
-                    "ion",
-                    "",
-                    lambda stem: self._measure(stem) > 1 and stem[-1] in ("s", "t"),
+                    'ion',
+                    '',
+                    lambda stem: self._measure(stem) > 1 and stem[-1] in ('s', 't'),
                 ),
-                ("ou", "", measure_gt_1),
-                ("ism", "", measure_gt_1),
-                ("ate", "", measure_gt_1),
-                ("iti", "", measure_gt_1),
-                ("ous", "", measure_gt_1),
-                ("ive", "", measure_gt_1),
-                ("ize", "", measure_gt_1),
+                ('ou', '', measure_gt_1),
+                ('ism', '', measure_gt_1),
+                ('ate', '', measure_gt_1),
+                ('iti', '', measure_gt_1),
+                ('ous', '', measure_gt_1),
+                ('ive', '', measure_gt_1),
+                ('ize', '', measure_gt_1),
             ],
         )
 
@@ -625,8 +629,8 @@ class PorterStemmer(StemmerI):
         # no explicit mention of the inconsistency; you have to infer it
         # from the examples.
         # For this reason, we can't use _apply_rule_list here.
-        if word.endswith("e"):
-            stem = self._replace_suffix(word, "e", "")
+        if word.endswith('e'):
+            stem = self._replace_suffix(word, 'e', '')
             if self._measure(stem) > 1:
                 return stem
             if self._measure(stem) == 1 and not self._ends_cvc(stem):
@@ -645,7 +649,7 @@ class PorterStemmer(StemmerI):
                                     roll           ->  roll
         """
         return self._apply_rule_list(
-            word, [("ll", "l", lambda stem: self._measure(word[:-1]) > 1)]
+            word, [('ll', 'l', lambda stem: self._measure(word[:-1]) > 1)]
         )
 
     def stem(self, word):
@@ -672,7 +676,7 @@ class PorterStemmer(StemmerI):
         return stem
 
     def __repr__(self):
-        return "<PorterStemmer>"
+        return '<PorterStemmer>'
 
 
 def demo():
@@ -694,16 +698,16 @@ def demo():
             stemmed.append(stemmer.stem(word))
 
     # Convert the results to a string, and word-wrap them.
-    results = " ".join(stemmed)
-    results = re.sub(r"(.{,70})\s", r"\1\n", results + " ").rstrip()
+    results = ' '.join(stemmed)
+    results = re.sub(r"(.{,70})\s", r'\1\n', results + ' ').rstrip()
 
     # Convert the original to a string, and word wrap it.
-    original = " ".join(orig)
-    original = re.sub(r"(.{,70})\s", r"\1\n", original + " ").rstrip()
+    original = ' '.join(orig)
+    original = re.sub(r"(.{,70})\s", r'\1\n', original + ' ').rstrip()
 
     # Print the results.
-    print("-Original-".center(70).replace(" ", "*").replace("-", " "))
+    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
     print(original)
-    print("-Results-".center(70).replace(" ", "*").replace("-", " "))
+    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
     print(results)
-    print("*" * 70)
+    print('*' * 70)
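To make the three modes referenced in this file concrete, a minimal sketch (the sample words are illustrative; NLTK_EXTENSIONS is the default, as the class docstring above notes):

    from nltk.stem.porter import PorterStemmer

    default = PorterStemmer()  # NLTK_EXTENSIONS is the default mode
    original = PorterStemmer(mode=PorterStemmer.ORIGINAL_ALGORITHM)

    for w in ('flies', 'dies', 'sensational'):
        # the NLTK-only rules (such as the 'ies'/'ied' special cases above)
        # may yield different stems than the original 1980 algorithm
        print(w, default.stem(w), original.stem(w))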
index e00f232..8f6ead5 100644
@@ -1,16 +1,19 @@
 # Natural Language Toolkit: Stemmers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
 #         Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import unicode_literals
 import re
 
 from nltk.stem.api import StemmerI
+from nltk.compat import python_2_unicode_compatible
 
 
+@python_2_unicode_compatible
 class RegexpStemmer(StemmerI):
     """
     A stemmer that uses regular expressions to identify morphological
@@ -41,7 +44,7 @@ class RegexpStemmer(StemmerI):
 
     def __init__(self, regexp, min=0):
 
-        if not hasattr(regexp, "pattern"):
+        if not hasattr(regexp, 'pattern'):
             regexp = re.compile(regexp)
         self._regexp = regexp
         self._min = min
@@ -50,7 +53,7 @@ class RegexpStemmer(StemmerI):
         if len(word) < self._min:
             return word
         else:
-            return self._regexp.sub("", word)
+            return self._regexp.sub('', word)
 
     def __repr__(self):
-        return "<RegexpStemmer: {!r}>".format(self._regexp.pattern)
+        return '<RegexpStemmer: {!r}>'.format(self._regexp.pattern)
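A minimal sketch of how the two constructor arguments shown above are used (the pattern and sample words are illustrative only):

    from nltk.stem.regexp import RegexpStemmer

    # strip a few trailing suffixes, but leave words shorter than min untouched
    st = RegexpStemmer(r'ing$|s$|e$|able$', min=4)
    print(st.stem('cars'), st.stem('running'), st.stem('was'))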
index 10f5de5..06184ee 100644
@@ -2,7 +2,7 @@
 
 # Natural Language Toolkit: RSLP Stemmer
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Tiago Tresoldi <tresoldi@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -30,7 +30,7 @@
 # comentário, inclusive sobre o desenvolvimento de um stemmer diferente
 # e/ou melhor para o português. Também sugiro utilizar-se a lista de discussão
 # do NLTK para o português para qualquer debate.
-
+from __future__ import print_function, unicode_literals
 from nltk.data import load
 
 from nltk.stem.api import StemmerI
@@ -65,7 +65,7 @@ class RSLPStemmer(StemmerI):
         self._model.append(self.read_rule("step6.pt"))
 
     def read_rule(self, filename):
-        rules = load("nltk:stemmers/rslp/" + filename, format="raw").decode("utf8")
+        rules = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8")
         lines = rules.split("\n")
 
         lines = [line for line in lines if line != ""]  # remove blank lines
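Because read_rule() above loads its tables through nltk.data, the 'rslp' data package has to be installed before the stemmer can be constructed; a minimal sketch (the Portuguese sample words are illustrative only):

    import nltk
    from nltk.stem.rslp import RSLPStemmer

    nltk.download('rslp')  # provides the step1.pt ... step6.pt rule files read above
    st = RSLPStemmer()
    print(st.stem('meninas'), st.stem('gostoso'))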
index aede6a4..f8e9214 100644
@@ -2,7 +2,7 @@
 #
 # Natural Language Toolkit: Snowball Stemmer
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Peter Michael Stahl <pemistahl@gmail.com>
 #         Peter Ljunglof <peter.ljunglof@heatherleaf.se> (revisions)
 #         Lakhdar Benzahia <lakhdar.benzahia@gmail.com>  (co-writer)
@@ -23,9 +23,12 @@ developed by Martin Porter.
 There is also a demo function: `snowball.demo()`.
 
 """
+from __future__ import unicode_literals, print_function
 
 import re
+from six.moves import input
 
+from nltk import compat
 from nltk.corpus import stopwords
 from nltk.stem import porter
 from nltk.stem.util import suffix_replace, prefix_replace
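A minimal usage sketch for the wrapper class defined in this file (the language name and sample words are illustrative; passing ignore_stopwords=True would additionally require the stopwords corpus imported above):

    from nltk.stem.snowball import SnowballStemmer

    print(SnowballStemmer.languages)  # language names accepted by the constructor
    st = SnowballStemmer('english')
    print(st.stem('generously'), st.stem('running'))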
@@ -115,6 +118,7 @@ class SnowballStemmer(StemmerI):
         return self.stemmer.stem(self, token)
 
 
+@compat.python_2_unicode_compatible
 class _LanguageSpecificStemmer(StemmerI):
 
     """
@@ -205,7 +209,7 @@ class _ScandinavianStemmer(_LanguageSpecificStemmer):
         r1 = ""
         for i in range(1, len(word)):
             if word[i] not in vowels and word[i - 1] in vowels:
-                if 3 > len(word[: i + 1]) > 0:
+                if len(word[: i + 1]) < 3 and len(word[: i + 1]) > 0:
                     r1 = word[3:]
                 elif len(word[: i + 1]) >= 3:
                     r1 = word[i + 1 :]
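The comparison rewritten in this hunk implements the Scandinavian adjustment to the R1 region; a standalone sketch of the same logic, assuming a generic vowel set (each language-specific stemmer defines its own __vowels):

    def r1_scandinavian(word, vowels='aeiouy\xE6\xE5\xF8'):
        # R1 is the part of the word after the first non-vowel that follows a
        # vowel, but it never starts before position 3.
        for i in range(1, len(word)):
            if word[i] not in vowels and word[i - 1] in vowels:
                return word[3:] if len(word[: i + 1]) < 3 else word[i + 1:]
        return ''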
@@ -319,196 +323,196 @@ class ArabicStemmer(_StandardStemmer):
 
     # Normalize_pre stes
     __vocalization = re.compile(
-        r"[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]"
+        r'[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]'
     )  # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ
 
-    __kasheeda = re.compile(r"[\u0640]")  # ـ tatweel/kasheeda
+    __kasheeda = re.compile(r'[\u0640]')  # ـ tatweel/kasheeda
 
-    __arabic_punctuation_marks = re.compile(r"[\u060C-\u061B-\u061F]")  #  ؛ ، ؟
+    __arabic_punctuation_marks = re.compile(r'[\u060C-\u061B-\u061F]')  #  ؛ ، ؟
 
     # Normalize_post
-    __last_hamzat = ("\u0623", "\u0625", "\u0622", "\u0624", "\u0626")  # أ، إ، آ، ؤ، ئ
+    __last_hamzat = ('\u0623', '\u0625', '\u0622', '\u0624', '\u0626')  # أ، إ، آ، ؤ، ئ
 
     # normalize other hamza's
-    __initial_hamzat = re.compile(r"^[\u0622\u0623\u0625]")  #  أ، إ، آ
+    __initial_hamzat = re.compile(r'^[\u0622\u0623\u0625]')  #  أ، إ، آ
 
-    __waw_hamza = re.compile(r"[\u0624]")  # ؤ
+    __waw_hamza = re.compile(r'[\u0624]')  # ؤ
 
-    __yeh_hamza = re.compile(r"[\u0626]")  # ئ
+    __yeh_hamza = re.compile(r'[\u0626]')  # ئ
 
-    __alefat = re.compile(r"[\u0623\u0622\u0625]")  #  أ، إ، آ
+    __alefat = re.compile(r'[\u0623\u0622\u0625]')  #  أ، إ، آ
 
     # Checks
     __checks1 = (
-        "\u0643\u0627\u0644",
-        "\u0628\u0627\u0644",  # بال، كال
-        "\u0627\u0644",
-        "\u0644\u0644",  # لل، ال
+        '\u0643\u0627\u0644',
+        '\u0628\u0627\u0644',  # بال، كال
+        '\u0627\u0644',
+        '\u0644\u0644',  # لل، ال
     )
 
-    __checks2 = ("\u0629", "\u0627\u062a")  # ة  #  female plural ات
+    __checks2 = ('\u0629', '\u0627\u062a')  # ة  #  female plural ات
 
     # Suffixes
     __suffix_noun_step1a = (
-        "\u064a",
-        "\u0643",
-        "\u0647",  # ي، ك، ه
-        "\u0646\u0627",
-        "\u0643\u0645",
-        "\u0647\u0627",
-        "\u0647\u0646",
-        "\u0647\u0645",  # نا، كم، ها، هن، هم
-        "\u0643\u0645\u0627",
-        "\u0647\u0645\u0627",  # كما، هما
+        '\u064a',
+        '\u0643',
+        '\u0647',  # ي، ك، ه
+        '\u0646\u0627',
+        '\u0643\u0645',
+        '\u0647\u0627',
+        '\u0647\u0646',
+        '\u0647\u0645',  # نا، كم، ها، هن، هم
+        '\u0643\u0645\u0627',
+        '\u0647\u0645\u0627',  # كما، هما
     )
 
-    __suffix_noun_step1b = "\u0646"  # ن
+    __suffix_noun_step1b = '\u0646'  # ن
 
-    __suffix_noun_step2a = ("\u0627", "\u064a", "\u0648")  # ا، ي، و
+    __suffix_noun_step2a = ('\u0627', '\u064a', '\u0648')  # ا، ي، و
 
-    __suffix_noun_step2b = "\u0627\u062a"  # ات
+    __suffix_noun_step2b = '\u0627\u062a'  # ات
 
-    __suffix_noun_step2c1 = "\u062a"  # ت
+    __suffix_noun_step2c1 = '\u062a'  # ت
 
-    __suffix_noun_step2c2 = "\u0629"  # ة
+    __suffix_noun_step2c2 = '\u0629'  # ة
 
-    __suffix_noun_step3 = "\u064a"  # ي
+    __suffix_noun_step3 = '\u064a'  # ي
 
     __suffix_verb_step1 = (
-        "\u0647",
-        "\u0643",  # ه، ك
-        "\u0646\u064a",
-        "\u0646\u0627",
-        "\u0647\u0627",
-        "\u0647\u0645",  # ني، نا، ها، هم
-        "\u0647\u0646",
-        "\u0643\u0645",
-        "\u0643\u0646",  # هن، كم، كن
-        "\u0647\u0645\u0627",
-        "\u0643\u0645\u0627",
-        "\u0643\u0645\u0648",  # هما، كما، كمو
+        '\u0647',
+        '\u0643',  # ه، ك
+        '\u0646\u064a',
+        '\u0646\u0627',
+        '\u0647\u0627',
+        '\u0647\u0645',  # ني، نا، ها، هم
+        '\u0647\u0646',
+        '\u0643\u0645',
+        '\u0643\u0646',  # هن، كم، كن
+        '\u0647\u0645\u0627',
+        '\u0643\u0645\u0627',
+        '\u0643\u0645\u0648',  # هما، كما، كمو
     )
 
     __suffix_verb_step2a = (
-        "\u062a",
-        "\u0627",
-        "\u0646",
-        "\u064a",  # ت، ا، ن، ي
-        "\u0646\u0627",
-        "\u062a\u0627",
-        "\u062a\u0646",  # نا، تا، تن Past
-        "\u0627\u0646",
-        "\u0648\u0646",
-        "\u064a\u0646",  # ان، هن، ين Present
-        "\u062a\u0645\u0627",  # تما
+        '\u062a',
+        '\u0627',
+        '\u0646',
+        '\u064a',  # ت، ا، ن، ي
+        '\u0646\u0627',
+        '\u062a\u0627',
+        '\u062a\u0646',  # نا، تا، تن Past
+        '\u0627\u0646',
+        '\u0648\u0646',
+        '\u064a\u0646',  # ان، هن، ين Present
+        '\u062a\u0645\u0627',  # تما
     )
 
-    __suffix_verb_step2b = ("\u0648\u0627", "\u062a\u0645")  # وا، تم
+    __suffix_verb_step2b = ('\u0648\u0627', '\u062a\u0645')  # وا، تم
 
-    __suffix_verb_step2c = ("\u0648", "\u062a\u0645\u0648")  # و  # تمو
+    __suffix_verb_step2c = ('\u0648', '\u062a\u0645\u0648')  # و  # تمو
 
-    __suffix_all_alef_maqsura = "\u0649"  # ى
+    __suffix_all_alef_maqsura = '\u0649'  # ى
 
     # Prefixes
     __prefix_step1 = (
-        "\u0623",  # أ
-        "\u0623\u0623",
-        "\u0623\u0622",
-        "\u0623\u0624",
-        "\u0623\u0627",
-        "\u0623\u0625",  # أأ، أآ، أؤ، أا، أإ
+        '\u0623',  # أ
+        '\u0623\u0623',
+        '\u0623\u0622',
+        '\u0623\u0624',
+        '\u0623\u0627',
+        '\u0623\u0625',  # أأ، أآ، أؤ، أا، أإ
     )
 
-    __prefix_step2a = ("\u0641\u0627\u0644", "\u0648\u0627\u0644")  # فال، وال
+    __prefix_step2a = ('\u0641\u0627\u0644', '\u0648\u0627\u0644')  # فال، وال
 
-    __prefix_step2b = ("\u0641", "\u0648")  # ف، و
+    __prefix_step2b = ('\u0641', '\u0648')  # ف، و
 
     __prefix_step3a_noun = (
-        "\u0627\u0644",
-        "\u0644\u0644",  # لل، ال
-        "\u0643\u0627\u0644",
-        "\u0628\u0627\u0644",  # بال، كال
+        '\u0627\u0644',
+        '\u0644\u0644',  # لل، ال
+        '\u0643\u0627\u0644',
+        '\u0628\u0627\u0644',  # بال، كال
     )
 
     __prefix_step3b_noun = (
-        "\u0628",
-        "\u0643",
-        "\u0644",  # ب، ك، ل
-        "\u0628\u0628",
-        "\u0643\u0643",  # بب، كك
+        '\u0628',
+        '\u0643',
+        '\u0644',  # ب، ك، ل
+        '\u0628\u0628',
+        '\u0643\u0643',  # بب، كك
     )
 
     __prefix_step3_verb = (
-        "\u0633\u064a",
-        "\u0633\u062a",
-        "\u0633\u0646",
-        "\u0633\u0623",
+        '\u0633\u064a',
+        '\u0633\u062a',
+        '\u0633\u0646',
+        '\u0633\u0623',
     )  # سي، ست، سن، سأ
 
     __prefix_step4_verb = (
-        "\u064a\u0633\u062a",
-        "\u0646\u0633\u062a",
-        "\u062a\u0633\u062a",
+        '\u064a\u0633\u062a',
+        '\u0646\u0633\u062a',
+        '\u062a\u0633\u062a',
     )  # يست، نست، تست
 
     # Suffixes added due to Conjugation Verbs
-    __conjugation_suffix_verb_1 = ("\u0647", "\u0643")  # ه، ك
+    __conjugation_suffix_verb_1 = ('\u0647', '\u0643')  # ه، ك
 
     __conjugation_suffix_verb_2 = (
-        "\u0646\u064a",
-        "\u0646\u0627",
-        "\u0647\u0627",  # ني، نا، ها
-        "\u0647\u0645",
-        "\u0647\u0646",
-        "\u0643\u0645",  # هم، هن، كم
-        "\u0643\u0646",  # كن
+        '\u0646\u064a',
+        '\u0646\u0627',
+        '\u0647\u0627',  # ني، نا، ها
+        '\u0647\u0645',
+        '\u0647\u0646',
+        '\u0643\u0645',  # هم، هن، كم
+        '\u0643\u0646',  # كن
     )
     __conjugation_suffix_verb_3 = (
-        "\u0647\u0645\u0627",
-        "\u0643\u0645\u0627",
-        "\u0643\u0645\u0648",
+        '\u0647\u0645\u0627',
+        '\u0643\u0645\u0627',
+        '\u0643\u0645\u0648',
     )  # هما، كما، كمو
 
-    __conjugation_suffix_verb_4 = ("\u0627", "\u0646", "\u064a")  # ا، ن، ي
+    __conjugation_suffix_verb_4 = ('\u0627', '\u0646', '\u064a')  # ا، ن، ي
 
     __conjugation_suffix_verb_past = (
-        "\u0646\u0627",
-        "\u062a\u0627",
-        "\u062a\u0646",
+        '\u0646\u0627',
+        '\u062a\u0627',
+        '\u062a\u0646',
     )  # نا، تا، تن
 
     __conjugation_suffix_verb_present = (
-        "\u0627\u0646",
-        "\u0648\u0646",
-        "\u064a\u0646",
+        '\u0627\u0646',
+        '\u0648\u0646',
+        '\u064a\u0646',
     )  # ان، ون، ين
 
     # Suffixes added due to derivation Names
-    __conjugation_suffix_noun_1 = ("\u064a", "\u0643", "\u0647")  # ي، ك، ه
+    __conjugation_suffix_noun_1 = ('\u064a', '\u0643', '\u0647')  # ي، ك، ه
 
     __conjugation_suffix_noun_2 = (
-        "\u0646\u0627",
-        "\u0643\u0645",  # نا، كم
-        "\u0647\u0627",
-        "\u0647\u0646",
-        "\u0647\u0645",  # ها، هن، هم
+        '\u0646\u0627',
+        '\u0643\u0645',  # نا، كم
+        '\u0647\u0627',
+        '\u0647\u0646',
+        '\u0647\u0645',  # ها، هن، هم
     )
 
     __conjugation_suffix_noun_3 = (
-        "\u0643\u0645\u0627",
-        "\u0647\u0645\u0627",
+        '\u0643\u0645\u0627',
+        '\u0647\u0645\u0627',
     )  # كما، هما
 
     # Prefixes added due to derivation Names
-    __prefixes1 = ("\u0648\u0627", "\u0641\u0627")  # فا، وا
+    __prefixes1 = ('\u0648\u0627', '\u0641\u0627')  # فا، وا
 
-    __articles_3len = ("\u0643\u0627\u0644", "\u0628\u0627\u0644")  # بال كال
+    __articles_3len = ('\u0643\u0627\u0644', '\u0628\u0627\u0644')  # بال كال
 
-    __articles_2len = ("\u0627\u0644", "\u0644\u0644")  # ال لل
+    __articles_2len = ('\u0627\u0644', '\u0644\u0644')  # ال لل
 
     # Prepositions letters
-    __prepositions1 = ("\u0643", "\u0644")  # ك، ل
-    __prepositions2 = ("\u0628\u0628", "\u0643\u0643")  # بب، كك
+    __prepositions1 = ('\u0643', '\u0644')  # ك، ل
+    __prepositions2 = ('\u0628\u0628', '\u0643\u0643')  # بب، كك
 
     is_verb = True
     is_noun = True
@@ -532,24 +536,24 @@ class ArabicStemmer(_StandardStemmer):
         :return: normalized token type string
         """
         # strip diacritics
-        token = self.__vocalization.sub("", token)
+        token = self.__vocalization.sub('', token)
         # strip kasheeda
-        token = self.__kasheeda.sub("", token)
+        token = self.__kasheeda.sub('', token)
         # strip punctuation marks
-        token = self.__arabic_punctuation_marks.sub("", token)
+        token = self.__arabic_punctuation_marks.sub('', token)
         return token
 
     def __normalize_post(self, token):
         # normalize last hamza
         for hamza in self.__last_hamzat:
             if token.endswith(hamza):
-                token = suffix_replace(token, hamza, "\u0621")
+                token = suffix_replace(token, hamza, '\u0621')
                 break
         # normalize other hamzat
-        token = self.__initial_hamzat.sub("\u0627", token)
-        token = self.__waw_hamza.sub("\u0648", token)
-        token = self.__yeh_hamza.sub("\u064a", token)
-        token = self.__alefat.sub("\u0627", token)
+        token = self.__initial_hamzat.sub('\u0627', token)
+        token = self.__waw_hamza.sub('\u0648', token)
+        token = self.__yeh_hamza.sub('\u064a', token)
+        token = self.__alefat.sub('\u0627', token)
         return token
 
     def __checks_1(self, token):
@@ -570,12 +574,12 @@ class ArabicStemmer(_StandardStemmer):
     def __checks_2(self, token):
         for suffix in self.__checks2:
             if token.endswith(suffix):
-                if suffix == "\u0629" and len(token) > 2:
+                if suffix == '\u0629' and len(token) > 2:
                     self.is_noun = True
                     self.is_verb = False
                     break
 
-                if suffix == "\u0627\u062a" and len(token) > 3:
+                if suffix == '\u0627\u062a' and len(token) > 3:
                     self.is_noun = True
                     self.is_verb = False
                     break
@@ -602,7 +606,7 @@ class ArabicStemmer(_StandardStemmer):
     def __Suffix_Verb_Step2a(self, token):
         for suffix in self.__suffix_verb_step2a:
             if token.endswith(suffix) and len(token) > 3:
-                if suffix == "\u062a" and len(token) >= 4:
+                if suffix == '\u062a' and len(token) >= 4:
                     token = token[:-1]
                     self.suffix_verb_step2a_success = True
                     break
@@ -622,7 +626,7 @@ class ArabicStemmer(_StandardStemmer):
                     self.suffix_verb_step2a_success = True
                     break
 
-                if suffix == "\u062a\u0645\u0627" and len(token) >= 6:
+                if suffix == '\u062a\u0645\u0627' and len(token) >= 6:
                     token = token[:-3]
                     self.suffix_verb_step2a_success = True
                     break
@@ -631,11 +635,11 @@ class ArabicStemmer(_StandardStemmer):
     def __Suffix_Verb_Step2c(self, token):
         for suffix in self.__suffix_verb_step2c:
             if token.endswith(suffix):
-                if suffix == "\u062a\u0645\u0648" and len(token) >= 6:
+                if suffix == '\u062a\u0645\u0648' and len(token) >= 6:
                     token = token[:-3]
                     break
 
-                if suffix == "\u0648" and len(token) >= 4:
+                if suffix == '\u0648' and len(token) >= 4:
                     token = token[:-1]
                     break
         return token
@@ -716,30 +720,30 @@ class ArabicStemmer(_StandardStemmer):
     def __Suffix_All_alef_maqsura(self, token):
         for suffix in self.__suffix_all_alef_maqsura:
             if token.endswith(suffix):
-                token = suffix_replace(token, suffix, "\u064a")
+                token = suffix_replace(token, suffix, '\u064a')
         return token
 
     def __Prefix_Step1(self, token):
         for prefix in self.__prefix_step1:
             if token.startswith(prefix) and len(token) > 3:
-                if prefix == "\u0623\u0623":
-                    token = prefix_replace(token, prefix, "\u0623")
+                if prefix == '\u0623\u0623':
+                    token = prefix_replace(token, prefix, '\u0623')
                     break
 
-                elif prefix == "\u0623\u0622":
-                    token = prefix_replace(token, prefix, "\u0622")
+                elif prefix == '\u0623\u0622':
+                    token = prefix_replace(token, prefix, '\u0622')
                     break
 
-                elif prefix == "\u0623\u0624":
-                    token = prefix_replace(token, prefix, "\u0624")
+                elif prefix == '\u0623\u0624':
+                    token = prefix_replace(token, prefix, '\u0624')
                     break
 
-                elif prefix == "\u0623\u0627":
-                    token = prefix_replace(token, prefix, "\u0627")
+                elif prefix == '\u0623\u0627':
+                    token = prefix_replace(token, prefix, '\u0627')
                     break
 
-                elif prefix == "\u0623\u0625":
-                    token = prefix_replace(token, prefix, "\u0625")
+                elif prefix == '\u0623\u0625':
+                    token = prefix_replace(token, prefix, '\u0625')
                     break
         return token
 
@@ -775,7 +779,7 @@ class ArabicStemmer(_StandardStemmer):
         for prefix in self.__prefix_step3b_noun:
             if token.startswith(prefix):
                 if len(token) > 3:
-                    if prefix == "\u0628":
+                    if prefix == '\u0628':
                         token = token[len(prefix) :]
                         self.prefix_step3b_noun_success = True
                         break
@@ -801,7 +805,7 @@ class ArabicStemmer(_StandardStemmer):
     def __Prefix_Step4_Verb(self, token):
         for prefix in self.__prefix_step4_verb:
             if token.startswith(prefix) and len(token) > 4:
-                token = prefix_replace(token, prefix, "\u0627\u0633\u062a")
+                token = prefix_replace(token, prefix, '\u0627\u0633\u062a')
                 self.is_verb = True
                 self.is_noun = False
                 break
@@ -1143,7 +1147,7 @@ class DutchStemmer(_StandardStemmer):
         # contains at least 3 letters.
         for i in range(1, len(word)):
             if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
-                if 3 > len(word[: i + 1]) > 0:
+                if len(word[: i + 1]) < 3 and len(word[: i + 1]) > 0:
                     r1 = word[3:]
                 elif len(word[: i + 1]) == 0:
                     return word
@@ -1299,61 +1303,61 @@ class EnglishStemmer(_StandardStemmer):
     __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s")
     __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed")
     __step2_suffixes = (
-        "ization",
-        "ational",
-        "fulness",
-        "ousness",
-        "iveness",
-        "tional",
-        "biliti",
-        "lessli",
-        "entli",
-        "ation",
-        "alism",
-        "aliti",
-        "ousli",
-        "iviti",
-        "fulli",
-        "enci",
-        "anci",
-        "abli",
-        "izer",
-        "ator",
-        "alli",
-        "bli",
-        "ogi",
-        "li",
+        'ization',
+        'ational',
+        'fulness',
+        'ousness',
+        'iveness',
+        'tional',
+        'biliti',
+        'lessli',
+        'entli',
+        'ation',
+        'alism',
+        'aliti',
+        'ousli',
+        'iviti',
+        'fulli',
+        'enci',
+        'anci',
+        'abli',
+        'izer',
+        'ator',
+        'alli',
+        'bli',
+        'ogi',
+        'li',
     )
     __step3_suffixes = (
-        "ational",
-        "tional",
-        "alize",
-        "icate",
-        "iciti",
-        "ative",
-        "ical",
-        "ness",
-        "ful",
+        'ational',
+        'tional',
+        'alize',
+        'icate',
+        'iciti',
+        'ative',
+        'ical',
+        'ness',
+        'ful',
     )
     __step4_suffixes = (
-        "ement",
-        "ance",
-        "ence",
-        "able",
-        "ible",
-        "ment",
-        "ant",
-        "ent",
-        "ism",
-        "ate",
-        "iti",
-        "ous",
-        "ive",
-        "ize",
-        "ion",
-        "al",
-        "er",
-        "ic",
+        'ement',
+        'ance',
+        'ence',
+        'able',
+        'ible',
+        'ment',
+        'ant',
+        'ent',
+        'ism',
+        'ate',
+        'iti',
+        'ous',
+        'ive',
+        'ize',
+        'ion',
+        'al',
+        'er',
+        'ic',
     )
     __step5_suffixes = ("e", "l")
     __special_words = {
@@ -1835,65 +1839,65 @@ class FinnishStemmer(_StandardStemmer):
         "zz",
     )
     __step1_suffixes = (
-        "kaan",
-        "k\xE4\xE4n",
-        "sti",
-        "kin",
-        "han",
-        "h\xE4n",
-        "ko",
-        "k\xF6",
-        "pa",
-        "p\xE4",
+        'kaan',
+        'k\xE4\xE4n',
+        'sti',
+        'kin',
+        'han',
+        'h\xE4n',
+        'ko',
+        'k\xF6',
+        'pa',
+        'p\xE4',
     )
-    __step2_suffixes = ("nsa", "ns\xE4", "mme", "nne", "si", "ni", "an", "\xE4n", "en")
+    __step2_suffixes = ('nsa', 'ns\xE4', 'mme', 'nne', 'si', 'ni', 'an', '\xE4n', 'en')
     __step3_suffixes = (
-        "siin",
-        "tten",
-        "seen",
-        "han",
-        "hen",
-        "hin",
-        "hon",
-        "h\xE4n",
-        "h\xF6n",
-        "den",
-        "tta",
-        "tt\xE4",
-        "ssa",
-        "ss\xE4",
-        "sta",
-        "st\xE4",
-        "lla",
-        "ll\xE4",
-        "lta",
-        "lt\xE4",
-        "lle",
-        "ksi",
-        "ine",
-        "ta",
-        "t\xE4",
-        "na",
-        "n\xE4",
-        "a",
-        "\xE4",
-        "n",
+        'siin',
+        'tten',
+        'seen',
+        'han',
+        'hen',
+        'hin',
+        'hon',
+        'h\xE4n',
+        'h\xF6n',
+        'den',
+        'tta',
+        'tt\xE4',
+        'ssa',
+        'ss\xE4',
+        'sta',
+        'st\xE4',
+        'lla',
+        'll\xE4',
+        'lta',
+        'lt\xE4',
+        'lle',
+        'ksi',
+        'ine',
+        'ta',
+        't\xE4',
+        'na',
+        'n\xE4',
+        'a',
+        '\xE4',
+        'n',
     )
     __step4_suffixes = (
-        "impi",
-        "impa",
-        "imp\xE4",
-        "immi",
-        "imma",
-        "imm\xE4",
-        "mpi",
-        "mpa",
-        "mp\xE4",
-        "mmi",
-        "mma",
-        "mm\xE4",
-        "eja",
-        "ej\xE4",
+        'impi',
+        'impa',
+        'imp\xE4',
+        'immi',
+        'imma',
+        'imm\xE4',
+        'mpi',
+        'mpa',
+        'mp\xE4',
+        'mmi',
+        'mma',
+        'mm\xE4',
+        'eja',
+        'ej\xE4',
     )
 
     def stem(self, word):
@@ -2145,128 +2149,128 @@ class FrenchStemmer(_StandardStemmer):
 
     __vowels = "aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9"
     __step1_suffixes = (
-        "issements",
-        "issement",
-        "atrices",
-        "atrice",
-        "ateurs",
-        "ations",
-        "logies",
-        "usions",
-        "utions",
-        "ements",
-        "amment",
-        "emment",
-        "ances",
-        "iqUes",
-        "ismes",
-        "ables",
-        "istes",
-        "ateur",
-        "ation",
-        "logie",
-        "usion",
-        "ution",
-        "ences",
-        "ement",
-        "euses",
-        "ments",
-        "ance",
-        "iqUe",
-        "isme",
-        "able",
-        "iste",
-        "ence",
-        "it\xE9s",
-        "ives",
-        "eaux",
-        "euse",
-        "ment",
-        "eux",
-        "it\xE9",
-        "ive",
-        "ifs",
-        "aux",
-        "if",
+        'issements',
+        'issement',
+        'atrices',
+        'atrice',
+        'ateurs',
+        'ations',
+        'logies',
+        'usions',
+        'utions',
+        'ements',
+        'amment',
+        'emment',
+        'ances',
+        'iqUes',
+        'ismes',
+        'ables',
+        'istes',
+        'ateur',
+        'ation',
+        'logie',
+        'usion',
+        'ution',
+        'ences',
+        'ement',
+        'euses',
+        'ments',
+        'ance',
+        'iqUe',
+        'isme',
+        'able',
+        'iste',
+        'ence',
+        'it\xE9s',
+        'ives',
+        'eaux',
+        'euse',
+        'ment',
+        'eux',
+        'it\xE9',
+        'ive',
+        'ifs',
+        'aux',
+        'if',
     )
     __step2a_suffixes = (
-        "issaIent",
-        "issantes",
-        "iraIent",
-        "issante",
-        "issants",
-        "issions",
-        "irions",
-        "issais",
-        "issait",
-        "issant",
-        "issent",
-        "issiez",
-        "issons",
-        "irais",
-        "irait",
-        "irent",
-        "iriez",
-        "irons",
-        "iront",
-        "isses",
-        "issez",
-        "\xEEmes",
-        "\xEEtes",
-        "irai",
-        "iras",
-        "irez",
-        "isse",
-        "ies",
-        "ira",
-        "\xEEt",
-        "ie",
-        "ir",
-        "is",
-        "it",
-        "i",
+        'issaIent',
+        'issantes',
+        'iraIent',
+        'issante',
+        'issants',
+        'issions',
+        'irions',
+        'issais',
+        'issait',
+        'issant',
+        'issent',
+        'issiez',
+        'issons',
+        'irais',
+        'irait',
+        'irent',
+        'iriez',
+        'irons',
+        'iront',
+        'isses',
+        'issez',
+        '\xEEmes',
+        '\xEEtes',
+        'irai',
+        'iras',
+        'irez',
+        'isse',
+        'ies',
+        'ira',
+        '\xEEt',
+        'ie',
+        'ir',
+        'is',
+        'it',
+        'i',
     )
     __step2b_suffixes = (
-        "eraIent",
-        "assions",
-        "erions",
-        "assent",
-        "assiez",
-        "\xE8rent",
-        "erais",
-        "erait",
-        "eriez",
-        "erons",
-        "eront",
-        "aIent",
-        "antes",
-        "asses",
-        "ions",
-        "erai",
-        "eras",
-        "erez",
-        "\xE2mes",
-        "\xE2tes",
-        "ante",
-        "ants",
-        "asse",
-        "\xE9es",
-        "era",
-        "iez",
-        "ais",
-        "ait",
-        "ant",
-        "\xE9e",
-        "\xE9s",
-        "er",
-        "ez",
-        "\xE2t",
-        "ai",
-        "as",
-        "\xE9",
-        "a",
+        'eraIent',
+        'assions',
+        'erions',
+        'assent',
+        'assiez',
+        '\xE8rent',
+        'erais',
+        'erait',
+        'eriez',
+        'erons',
+        'eront',
+        'aIent',
+        'antes',
+        'asses',
+        'ions',
+        'erai',
+        'eras',
+        'erez',
+        '\xE2mes',
+        '\xE2tes',
+        'ante',
+        'ants',
+        'asse',
+        '\xE9es',
+        'era',
+        'iez',
+        'ais',
+        'ait',
+        'ant',
+        '\xE9e',
+        '\xE9s',
+        'er',
+        'ez',
+        '\xE2t',
+        'ai',
+        'as',
+        '\xE9',
+        'a',
     )
-    __step4_suffixes = ("i\xE8re", "I\xE8re", "ion", "ier", "Ier", "e", "\xEB")
+    __step4_suffixes = ('i\xE8re', 'I\xE8re', 'ion', 'ier', 'Ier', 'e', '\xEB')
 
     def stem(self, word):
         """
@@ -2485,48 +2489,48 @@ class FrenchStemmer(_StandardStemmer):
                             step2b_success = True
 
                         elif suffix in (
-                            "eraIent",
-                            "erions",
-                            "\xE8rent",
-                            "erais",
-                            "erait",
-                            "eriez",
-                            "erons",
-                            "eront",
-                            "erai",
-                            "eras",
-                            "erez",
-                            "\xE9es",
-                            "era",
-                            "iez",
-                            "\xE9e",
-                            "\xE9s",
-                            "er",
-                            "ez",
-                            "\xE9",
+                            'eraIent',
+                            'erions',
+                            '\xE8rent',
+                            'erais',
+                            'erait',
+                            'eriez',
+                            'erons',
+                            'eront',
+                            'erai',
+                            'eras',
+                            'erez',
+                            '\xE9es',
+                            'era',
+                            'iez',
+                            '\xE9e',
+                            '\xE9s',
+                            'er',
+                            'ez',
+                            '\xE9',
                         ):
                             word = word[: -len(suffix)]
                             step2b_success = True
 
                         elif suffix in (
-                            "assions",
-                            "assent",
-                            "assiez",
-                            "aIent",
-                            "antes",
-                            "asses",
-                            "\xE2mes",
-                            "\xE2tes",
-                            "ante",
-                            "ants",
-                            "asse",
-                            "ais",
-                            "ait",
-                            "ant",
-                            "\xE2t",
-                            "ai",
-                            "as",
-                            "a",
+                            'assions',
+                            'assent',
+                            'assiez',
+                            'aIent',
+                            'antes',
+                            'asses',
+                            '\xE2mes',
+                            '\xE2tes',
+                            'ante',
+                            'ants',
+                            'asse',
+                            'ais',
+                            'ait',
+                            'ant',
+                            '\xE2t',
+                            'ai',
+                            'as',
+                            'a',
                         ):
                             word = word[: -len(suffix)]
                             rv = rv[: -len(suffix)]
@@ -2681,7 +2685,7 @@ class GermanStemmer(_StandardStemmer):
         # contains at least 3 letters.
         for i in range(1, len(word)):
             if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
-                if 3 > len(word[: i + 1]) > 0:
+                if len(word[: i + 1]) < 3 and len(word[: i + 1]) > 0:
                     r1 = word[3:]
                 elif len(word[: i + 1]) == 0:
                     return word
@@ -2838,151 +2842,151 @@ class HungarianStemmer(_LanguageSpecificStemmer):
 
     __step1_suffixes = ("al", "el")
     __step2_suffixes = (
-        "k\xE9ppen",
-        "onk\xE9nt",
-        "enk\xE9nt",
-        "ank\xE9nt",
-        "k\xE9pp",
-        "k\xE9nt",
-        "ban",
-        "ben",
-        "nak",
-        "nek",
-        "val",
-        "vel",
-        "t\xF3l",
-        "t\xF5l",
-        "r\xF3l",
-        "r\xF5l",
-        "b\xF3l",
-        "b\xF5l",
-        "hoz",
-        "hez",
-        "h\xF6z",
-        "n\xE1l",
-        "n\xE9l",
-        "\xE9rt",
-        "kor",
-        "ba",
-        "be",
-        "ra",
-        "re",
-        "ig",
-        "at",
-        "et",
-        "ot",
-        "\xF6t",
-        "ul",
-        "\xFCl",
-        "v\xE1",
-        "v\xE9",
-        "en",
-        "on",
-        "an",
-        "\xF6n",
-        "n",
-        "t",
+        'k\xE9ppen',
+        'onk\xE9nt',
+        'enk\xE9nt',
+        'ank\xE9nt',
+        'k\xE9pp',
+        'k\xE9nt',
+        'ban',
+        'ben',
+        'nak',
+        'nek',
+        'val',
+        'vel',
+        't\xF3l',
+        't\xF5l',
+        'r\xF3l',
+        'r\xF5l',
+        'b\xF3l',
+        'b\xF5l',
+        'hoz',
+        'hez',
+        'h\xF6z',
+        'n\xE1l',
+        'n\xE9l',
+        '\xE9rt',
+        'kor',
+        'ba',
+        'be',
+        'ra',
+        're',
+        'ig',
+        'at',
+        'et',
+        'ot',
+        '\xF6t',
+        'ul',
+        '\xFCl',
+        'v\xE1',
+        'v\xE9',
+        'en',
+        'on',
+        'an',
+        '\xF6n',
+        'n',
+        't',
     )
     __step3_suffixes = ("\xE1nk\xE9nt", "\xE1n", "\xE9n")
     __step4_suffixes = (
-        "astul",
-        "est\xFCl",
-        "\xE1stul",
-        "\xE9st\xFCl",
-        "stul",
-        "st\xFCl",
+        'astul',
+        'est\xFCl',
+        '\xE1stul',
+        '\xE9st\xFCl',
+        'stul',
+        'st\xFCl',
     )
     __step5_suffixes = ("\xE1", "\xE9")
     __step6_suffixes = (
-        "ok\xE9",
-        "\xF6k\xE9",
-        "ak\xE9",
-        "ek\xE9",
-        "\xE1k\xE9",
-        "\xE1\xE9i",
-        "\xE9k\xE9",
-        "\xE9\xE9i",
-        "k\xE9",
-        "\xE9i",
-        "\xE9\xE9",
-        "\xE9",
+        'ok\xE9',
+        '\xF6k\xE9',
+        'ak\xE9',
+        'ek\xE9',
+        '\xE1k\xE9',
+        '\xE1\xE9i',
+        '\xE9k\xE9',
+        '\xE9\xE9i',
+        'k\xE9',
+        '\xE9i',
+        '\xE9\xE9',
+        '\xE9',
     )
     __step7_suffixes = (
-        "\xE1juk",
-        "\xE9j\xFCk",
-        "\xFCnk",
-        "unk",
-        "juk",
-        "j\xFCk",
-        "\xE1nk",
-        "\xE9nk",
-        "nk",
-        "uk",
-        "\xFCk",
-        "em",
-        "om",
-        "am",
-        "od",
-        "ed",
-        "ad",
-        "\xF6d",
-        "ja",
-        "je",
-        "\xE1m",
-        "\xE1d",
-        "\xE9m",
-        "\xE9d",
-        "m",
-        "d",
-        "a",
-        "e",
-        "o",
-        "\xE1",
-        "\xE9",
+        '\xE1juk',
+        '\xE9j\xFCk',
+        '\xFCnk',
+        'unk',
+        'juk',
+        'j\xFCk',
+        '\xE1nk',
+        '\xE9nk',
+        'nk',
+        'uk',
+        '\xFCk',
+        'em',
+        'om',
+        'am',
+        'od',
+        'ed',
+        'ad',
+        '\xF6d',
+        'ja',
+        'je',
+        '\xE1m',
+        '\xE1d',
+        '\xE9m',
+        '\xE9d',
+        'm',
+        'd',
+        'a',
+        'e',
+        'o',
+        '\xE1',
+        '\xE9',
     )
     __step8_suffixes = (
-        "jaitok",
-        "jeitek",
-        "jaink",
-        "jeink",
-        "aitok",
-        "eitek",
-        "\xE1itok",
-        "\xE9itek",
-        "jaim",
-        "jeim",
-        "jaid",
-        "jeid",
-        "eink",
-        "aink",
-        "itek",
-        "jeik",
-        "jaik",
-        "\xE1ink",
-        "\xE9ink",
-        "aim",
-        "eim",
-        "aid",
-        "eid",
-        "jai",
-        "jei",
-        "ink",
-        "aik",
-        "eik",
-        "\xE1im",
-        "\xE1id",
-        "\xE1ik",
-        "\xE9im",
-        "\xE9id",
-        "\xE9ik",
-        "im",
-        "id",
-        "ai",
-        "ei",
-        "ik",
-        "\xE1i",
-        "\xE9i",
-        "i",
+        'jaitok',
+        'jeitek',
+        'jaink',
+        'jeink',
+        'aitok',
+        'eitek',
+        '\xE1itok',
+        '\xE9itek',
+        'jaim',
+        'jeim',
+        'jaid',
+        'jeid',
+        'eink',
+        'aink',
+        'itek',
+        'jeik',
+        'jaik',
+        '\xE1ink',
+        '\xE9ink',
+        'aim',
+        'eim',
+        'aid',
+        'eid',
+        'jai',
+        'jei',
+        'ink',
+        'aik',
+        'eik',
+        '\xE1im',
+        '\xE1id',
+        '\xE1ik',
+        '\xE9im',
+        '\xE9id',
+        '\xE9ik',
+        'im',
+        'id',
+        'ai',
+        'ei',
+        'ik',
+        '\xE1i',
+        '\xE9i',
+        'i',
     )
     __step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok", "ek", "ak", "k")
 
@@ -3207,185 +3211,185 @@ class ItalianStemmer(_StandardStemmer):
 
     __vowels = "aeiou\xE0\xE8\xEC\xF2\xF9"
     __step0_suffixes = (
-        "gliela",
-        "gliele",
-        "glieli",
-        "glielo",
-        "gliene",
-        "sene",
-        "mela",
-        "mele",
-        "meli",
-        "melo",
-        "mene",
-        "tela",
-        "tele",
-        "teli",
-        "telo",
-        "tene",
-        "cela",
-        "cele",
-        "celi",
-        "celo",
-        "cene",
-        "vela",
-        "vele",
-        "veli",
-        "velo",
-        "vene",
-        "gli",
-        "ci",
-        "la",
-        "le",
-        "li",
-        "lo",
-        "mi",
-        "ne",
-        "si",
-        "ti",
-        "vi",
+        'gliela',
+        'gliele',
+        'glieli',
+        'glielo',
+        'gliene',
+        'sene',
+        'mela',
+        'mele',
+        'meli',
+        'melo',
+        'mene',
+        'tela',
+        'tele',
+        'teli',
+        'telo',
+        'tene',
+        'cela',
+        'cele',
+        'celi',
+        'celo',
+        'cene',
+        'vela',
+        'vele',
+        'veli',
+        'velo',
+        'vene',
+        'gli',
+        'ci',
+        'la',
+        'le',
+        'li',
+        'lo',
+        'mi',
+        'ne',
+        'si',
+        'ti',
+        'vi',
     )
     __step1_suffixes = (
-        "atrice",
-        "atrici",
-        "azione",
-        "azioni",
-        "uzione",
-        "uzioni",
-        "usione",
-        "usioni",
-        "amento",
-        "amenti",
-        "imento",
-        "imenti",
-        "amente",
-        "abile",
-        "abili",
-        "ibile",
-        "ibili",
-        "mente",
-        "atore",
-        "atori",
-        "logia",
-        "logie",
-        "anza",
-        "anze",
-        "iche",
-        "ichi",
-        "ismo",
-        "ismi",
-        "ista",
-        "iste",
-        "isti",
-        "ist\xE0",
-        "ist\xE8",
-        "ist\xEC",
-        "ante",
-        "anti",
-        "enza",
-        "enze",
-        "ico",
-        "ici",
-        "ica",
-        "ice",
-        "oso",
-        "osi",
-        "osa",
-        "ose",
-        "it\xE0",
-        "ivo",
-        "ivi",
-        "iva",
-        "ive",
+        'atrice',
+        'atrici',
+        'azione',
+        'azioni',
+        'uzione',
+        'uzioni',
+        'usione',
+        'usioni',
+        'amento',
+        'amenti',
+        'imento',
+        'imenti',
+        'amente',
+        'abile',
+        'abili',
+        'ibile',
+        'ibili',
+        'mente',
+        'atore',
+        'atori',
+        'logia',
+        'logie',
+        'anza',
+        'anze',
+        'iche',
+        'ichi',
+        'ismo',
+        'ismi',
+        'ista',
+        'iste',
+        'isti',
+        'ist\xE0',
+        'ist\xE8',
+        'ist\xEC',
+        'ante',
+        'anti',
+        'enza',
+        'enze',
+        'ico',
+        'ici',
+        'ica',
+        'ice',
+        'oso',
+        'osi',
+        'osa',
+        'ose',
+        'it\xE0',
+        'ivo',
+        'ivi',
+        'iva',
+        'ive',
     )
     __step2_suffixes = (
-        "erebbero",
-        "irebbero",
-        "assero",
-        "assimo",
-        "eranno",
-        "erebbe",
-        "eremmo",
-        "ereste",
-        "eresti",
-        "essero",
-        "iranno",
-        "irebbe",
-        "iremmo",
-        "ireste",
-        "iresti",
-        "iscano",
-        "iscono",
-        "issero",
-        "arono",
-        "avamo",
-        "avano",
-        "avate",
-        "eremo",
-        "erete",
-        "erono",
-        "evamo",
-        "evano",
-        "evate",
-        "iremo",
-        "irete",
-        "irono",
-        "ivamo",
-        "ivano",
-        "ivate",
-        "ammo",
-        "ando",
-        "asse",
-        "assi",
-        "emmo",
-        "enda",
-        "ende",
-        "endi",
-        "endo",
-        "erai",
-        "erei",
-        "Yamo",
-        "iamo",
-        "immo",
-        "irai",
-        "irei",
-        "isca",
-        "isce",
-        "isci",
-        "isco",
-        "ano",
-        "are",
-        "ata",
-        "ate",
-        "ati",
-        "ato",
-        "ava",
-        "avi",
-        "avo",
-        "er\xE0",
-        "ere",
-        "er\xF2",
-        "ete",
-        "eva",
-        "evi",
-        "evo",
-        "ir\xE0",
-        "ire",
-        "ir\xF2",
-        "ita",
-        "ite",
-        "iti",
-        "ito",
-        "iva",
-        "ivi",
-        "ivo",
-        "ono",
-        "uta",
-        "ute",
-        "uti",
-        "uto",
-        "ar",
-        "ir",
+        'erebbero',
+        'irebbero',
+        'assero',
+        'assimo',
+        'eranno',
+        'erebbe',
+        'eremmo',
+        'ereste',
+        'eresti',
+        'essero',
+        'iranno',
+        'irebbe',
+        'iremmo',
+        'ireste',
+        'iresti',
+        'iscano',
+        'iscono',
+        'issero',
+        'arono',
+        'avamo',
+        'avano',
+        'avate',
+        'eremo',
+        'erete',
+        'erono',
+        'evamo',
+        'evano',
+        'evate',
+        'iremo',
+        'irete',
+        'irono',
+        'ivamo',
+        'ivano',
+        'ivate',
+        'ammo',
+        'ando',
+        'asse',
+        'assi',
+        'emmo',
+        'enda',
+        'ende',
+        'endi',
+        'endo',
+        'erai',
+        'erei',
+        'Yamo',
+        'iamo',
+        'immo',
+        'irai',
+        'irei',
+        'isca',
+        'isce',
+        'isci',
+        'isco',
+        'ano',
+        'are',
+        'ata',
+        'ate',
+        'ati',
+        'ato',
+        'ava',
+        'avi',
+        'avo',
+        'er\xE0',
+        'ere',
+        'er\xF2',
+        'ete',
+        'eva',
+        'evi',
+        'evo',
+        'ir\xE0',
+        'ire',
+        'ir\xF2',
+        'ita',
+        'ite',
+        'iti',
+        'ito',
+        'iva',
+        'ivi',
+        'ivo',
+        'ono',
+        'uta',
+        'ute',
+        'uti',
+        'uto',
+        'ar',
+        'ir',
     )
 
     def stem(self, word):
@@ -3705,175 +3709,175 @@ class PortugueseStemmer(_StandardStemmer):
 
     __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4"
     __step1_suffixes = (
-        "amentos",
-        "imentos",
-        "uço~es",
-        "amento",
-        "imento",
-        "adoras",
-        "adores",
-        "a\xE7o~es",
-        "logias",
-        "\xEAncias",
-        "amente",
-        "idades",
-        "an\xE7as",
-        "ismos",
-        "istas",
-        "adora",
-        "a\xE7a~o",
-        "antes",
-        "\xE2ncia",
-        "logia",
-        "uça~o",
-        "\xEAncia",
-        "mente",
-        "idade",
-        "an\xE7a",
-        "ezas",
-        "icos",
-        "icas",
-        "ismo",
-        "\xE1vel",
-        "\xEDvel",
-        "ista",
-        "osos",
-        "osas",
-        "ador",
-        "ante",
-        "ivas",
-        "ivos",
-        "iras",
-        "eza",
-        "ico",
-        "ica",
-        "oso",
-        "osa",
-        "iva",
-        "ivo",
-        "ira",
+        'amentos',
+        'imentos',
+        'uço~es',
+        'amento',
+        'imento',
+        'adoras',
+        'adores',
+        'a\xE7o~es',
+        'logias',
+        '\xEAncias',
+        'amente',
+        'idades',
+        'an\xE7as',
+        'ismos',
+        'istas',
+        'adora',
+        'a\xE7a~o',
+        'antes',
+        '\xE2ncia',
+        'logia',
+        'uça~o',
+        '\xEAncia',
+        'mente',
+        'idade',
+        'an\xE7a',
+        'ezas',
+        'icos',
+        'icas',
+        'ismo',
+        '\xE1vel',
+        '\xEDvel',
+        'ista',
+        'osos',
+        'osas',
+        'ador',
+        'ante',
+        'ivas',
+        'ivos',
+        'iras',
+        'eza',
+        'ico',
+        'ica',
+        'oso',
+        'osa',
+        'iva',
+        'ivo',
+        'ira',
     )
     __step2_suffixes = (
-        "ar\xEDamos",
-        "er\xEDamos",
-        "ir\xEDamos",
-        "\xE1ssemos",
-        "\xEAssemos",
-        "\xEDssemos",
-        "ar\xEDeis",
-        "er\xEDeis",
-        "ir\xEDeis",
-        "\xE1sseis",
-        "\xE9sseis",
-        "\xEDsseis",
-        "\xE1ramos",
-        "\xE9ramos",
-        "\xEDramos",
-        "\xE1vamos",
-        "aremos",
-        "eremos",
-        "iremos",
-        "ariam",
-        "eriam",
-        "iriam",
-        "assem",
-        "essem",
-        "issem",
-        "ara~o",
-        "era~o",
-        "ira~o",
-        "arias",
-        "erias",
-        "irias",
-        "ardes",
-        "erdes",
-        "irdes",
-        "asses",
-        "esses",
-        "isses",
-        "astes",
-        "estes",
-        "istes",
-        "\xE1reis",
-        "areis",
-        "\xE9reis",
-        "ereis",
-        "\xEDreis",
-        "ireis",
-        "\xE1veis",
-        "\xEDamos",
-        "armos",
-        "ermos",
-        "irmos",
-        "aria",
-        "eria",
-        "iria",
-        "asse",
-        "esse",
-        "isse",
-        "aste",
-        "este",
-        "iste",
-        "arei",
-        "erei",
-        "irei",
-        "aram",
-        "eram",
-        "iram",
-        "avam",
-        "arem",
-        "erem",
-        "irem",
-        "ando",
-        "endo",
-        "indo",
-        "adas",
-        "idas",
-        "ar\xE1s",
-        "aras",
-        "er\xE1s",
-        "eras",
-        "ir\xE1s",
-        "avas",
-        "ares",
-        "eres",
-        "ires",
-        "\xEDeis",
-        "ados",
-        "idos",
-        "\xE1mos",
-        "amos",
-        "emos",
-        "imos",
-        "iras",
-        "ada",
-        "ida",
-        "ar\xE1",
-        "ara",
-        "er\xE1",
-        "era",
-        "ir\xE1",
-        "ava",
-        "iam",
-        "ado",
-        "ido",
-        "ias",
-        "ais",
-        "eis",
-        "ira",
-        "ia",
-        "ei",
-        "am",
-        "em",
-        "ar",
-        "er",
-        "ir",
-        "as",
-        "es",
-        "is",
-        "eu",
-        "iu",
-        "ou",
+        'ar\xEDamos',
+        'er\xEDamos',
+        'ir\xEDamos',
+        '\xE1ssemos',
+        '\xEAssemos',
+        '\xEDssemos',
+        'ar\xEDeis',
+        'er\xEDeis',
+        'ir\xEDeis',
+        '\xE1sseis',
+        '\xE9sseis',
+        '\xEDsseis',
+        '\xE1ramos',
+        '\xE9ramos',
+        '\xEDramos',
+        '\xE1vamos',
+        'aremos',
+        'eremos',
+        'iremos',
+        'ariam',
+        'eriam',
+        'iriam',
+        'assem',
+        'essem',
+        'issem',
+        'ara~o',
+        'era~o',
+        'ira~o',
+        'arias',
+        'erias',
+        'irias',
+        'ardes',
+        'erdes',
+        'irdes',
+        'asses',
+        'esses',
+        'isses',
+        'astes',
+        'estes',
+        'istes',
+        '\xE1reis',
+        'areis',
+        '\xE9reis',
+        'ereis',
+        '\xEDreis',
+        'ireis',
+        '\xE1veis',
+        '\xEDamos',
+        'armos',
+        'ermos',
+        'irmos',
+        'aria',
+        'eria',
+        'iria',
+        'asse',
+        'esse',
+        'isse',
+        'aste',
+        'este',
+        'iste',
+        'arei',
+        'erei',
+        'irei',
+        'aram',
+        'eram',
+        'iram',
+        'avam',
+        'arem',
+        'erem',
+        'irem',
+        'ando',
+        'endo',
+        'indo',
+        'adas',
+        'idas',
+        'ar\xE1s',
+        'aras',
+        'er\xE1s',
+        'eras',
+        'ir\xE1s',
+        'avas',
+        'ares',
+        'eres',
+        'ires',
+        '\xEDeis',
+        'ados',
+        'idos',
+        '\xE1mos',
+        'amos',
+        'emos',
+        'imos',
+        'iras',
+        'ada',
+        'ida',
+        'ar\xE1',
+        'ara',
+        'er\xE1',
+        'era',
+        'ir\xE1',
+        'ava',
+        'iam',
+        'ado',
+        'ido',
+        'ias',
+        'ais',
+        'eis',
+        'ira',
+        'ia',
+        'ei',
+        'am',
+        'em',
+        'ar',
+        'er',
+        'ir',
+        'as',
+        'es',
+        'is',
+        'eu',
+        'iu',
+        'ou',
     )
     __step4_suffixes = ("os", "a", "i", "o", "\xE1", "\xED", "\xF3")
 
@@ -4053,230 +4057,230 @@ class RomanianStemmer(_StandardStemmer):
 
     __vowels = "aeiou\u0103\xE2\xEE"
     __step0_suffixes = (
-        "iilor",
-        "ului",
-        "elor",
-        "iile",
-        "ilor",
-        "atei",
-        "a\u0163ie",
-        "a\u0163ia",
-        "aua",
-        "ele",
-        "iua",
-        "iei",
-        "ile",
-        "ul",
-        "ea",
-        "ii",
+        'iilor',
+        'ului',
+        'elor',
+        'iile',
+        'ilor',
+        'atei',
+        'a\u0163ie',
+        'a\u0163ia',
+        'aua',
+        'ele',
+        'iua',
+        'iei',
+        'ile',
+        'ul',
+        'ea',
+        'ii',
     )
     __step1_suffixes = (
-        "abilitate",
-        "abilitati",
-        "abilit\u0103\u0163i",
-        "ibilitate",
-        "abilit\u0103i",
-        "ivitate",
-        "ivitati",
-        "ivit\u0103\u0163i",
-        "icitate",
-        "icitati",
-        "icit\u0103\u0163i",
-        "icatori",
-        "ivit\u0103i",
-        "icit\u0103i",
-        "icator",
-        "a\u0163iune",
-        "atoare",
-        "\u0103toare",
-        "i\u0163iune",
-        "itoare",
-        "iciva",
-        "icive",
-        "icivi",
-        "iciv\u0103",
-        "icala",
-        "icale",
-        "icali",
-        "ical\u0103",
-        "ativa",
-        "ative",
-        "ativi",
-        "ativ\u0103",
-        "atori",
-        "\u0103tori",
-        "itiva",
-        "itive",
-        "itivi",
-        "itiv\u0103",
-        "itori",
-        "iciv",
-        "ical",
-        "ativ",
-        "ator",
-        "\u0103tor",
-        "itiv",
-        "itor",
+        'abilitate',
+        'abilitati',
+        'abilit\u0103\u0163i',
+        'ibilitate',
+        'abilit\u0103i',
+        'ivitate',
+        'ivitati',
+        'ivit\u0103\u0163i',
+        'icitate',
+        'icitati',
+        'icit\u0103\u0163i',
+        'icatori',
+        'ivit\u0103i',
+        'icit\u0103i',
+        'icator',
+        'a\u0163iune',
+        'atoare',
+        '\u0103toare',
+        'i\u0163iune',
+        'itoare',
+        'iciva',
+        'icive',
+        'icivi',
+        'iciv\u0103',
+        'icala',
+        'icale',
+        'icali',
+        'ical\u0103',
+        'ativa',
+        'ative',
+        'ativi',
+        'ativ\u0103',
+        'atori',
+        '\u0103tori',
+        'itiva',
+        'itive',
+        'itivi',
+        'itiv\u0103',
+        'itori',
+        'iciv',
+        'ical',
+        'ativ',
+        'ator',
+        '\u0103tor',
+        'itiv',
+        'itor',
     )
     __step2_suffixes = (
-        "abila",
-        "abile",
-        "abili",
-        "abil\u0103",
-        "ibila",
-        "ibile",
-        "ibili",
-        "ibil\u0103",
-        "atori",
-        "itate",
-        "itati",
-        "it\u0103\u0163i",
-        "abil",
-        "ibil",
-        "oasa",
-        "oas\u0103",
-        "oase",
-        "anta",
-        "ante",
-        "anti",
-        "ant\u0103",
-        "ator",
-        "it\u0103i",
-        "iune",
-        "iuni",
-        "isme",
-        "ista",
-        "iste",
-        "isti",
-        "ist\u0103",
-        "i\u015Fti",
-        "ata",
-        "at\u0103",
-        "ati",
-        "ate",
-        "uta",
-        "ut\u0103",
-        "uti",
-        "ute",
-        "ita",
-        "it\u0103",
-        "iti",
-        "ite",
-        "ica",
-        "ice",
-        "ici",
-        "ic\u0103",
-        "osi",
-        "o\u015Fi",
-        "ant",
-        "iva",
-        "ive",
-        "ivi",
-        "iv\u0103",
-        "ism",
-        "ist",
-        "at",
-        "ut",
-        "it",
-        "ic",
-        "os",
-        "iv",
+        'abila',
+        'abile',
+        'abili',
+        'abil\u0103',
+        'ibila',
+        'ibile',
+        'ibili',
+        'ibil\u0103',
+        'atori',
+        'itate',
+        'itati',
+        'it\u0103\u0163i',
+        'abil',
+        'ibil',
+        'oasa',
+        'oas\u0103',
+        'oase',
+        'anta',
+        'ante',
+        'anti',
+        'ant\u0103',
+        'ator',
+        'it\u0103i',
+        'iune',
+        'iuni',
+        'isme',
+        'ista',
+        'iste',
+        'isti',
+        'ist\u0103',
+        'i\u015Fti',
+        'ata',
+        'at\u0103',
+        'ati',
+        'ate',
+        'uta',
+        'ut\u0103',
+        'uti',
+        'ute',
+        'ita',
+        'it\u0103',
+        'iti',
+        'ite',
+        'ica',
+        'ice',
+        'ici',
+        'ic\u0103',
+        'osi',
+        'o\u015Fi',
+        'ant',
+        'iva',
+        'ive',
+        'ivi',
+        'iv\u0103',
+        'ism',
+        'ist',
+        'at',
+        'ut',
+        'it',
+        'ic',
+        'os',
+        'iv',
     )
     __step3_suffixes = (
-        "seser\u0103\u0163i",
-        "aser\u0103\u0163i",
-        "iser\u0103\u0163i",
-        "\xE2ser\u0103\u0163i",
-        "user\u0103\u0163i",
-        "seser\u0103m",
-        "aser\u0103m",
-        "iser\u0103m",
-        "\xE2ser\u0103m",
-        "user\u0103m",
-        "ser\u0103\u0163i",
-        "sese\u015Fi",
-        "seser\u0103",
-        "easc\u0103",
-        "ar\u0103\u0163i",
-        "ur\u0103\u0163i",
-        "ir\u0103\u0163i",
-        "\xE2r\u0103\u0163i",
-        "ase\u015Fi",
-        "aser\u0103",
-        "ise\u015Fi",
-        "iser\u0103",
-        "\xe2se\u015Fi",
-        "\xE2ser\u0103",
-        "use\u015Fi",
-        "user\u0103",
-        "ser\u0103m",
-        "sesem",
-        "indu",
-        "\xE2ndu",
-        "eaz\u0103",
-        "e\u015Fti",
-        "e\u015Fte",
-        "\u0103\u015Fti",
-        "\u0103\u015Fte",
-        "ea\u0163i",
-        "ia\u0163i",
-        "ar\u0103m",
-        "ur\u0103m",
-        "ir\u0103m",
-        "\xE2r\u0103m",
-        "asem",
-        "isem",
-        "\xE2sem",
-        "usem",
-        "se\u015Fi",
-        "ser\u0103",
-        "sese",
-        "are",
-        "ere",
-        "ire",
-        "\xE2re",
-        "ind",
-        "\xE2nd",
-        "eze",
-        "ezi",
-        "esc",
-        "\u0103sc",
-        "eam",
-        "eai",
-        "eau",
-        "iam",
-        "iai",
-        "iau",
-        "a\u015Fi",
-        "ar\u0103",
-        "u\u015Fi",
-        "ur\u0103",
-        "i\u015Fi",
-        "ir\u0103",
-        "\xE2\u015Fi",
-        "\xe2r\u0103",
-        "ase",
-        "ise",
-        "\xE2se",
-        "use",
-        "a\u0163i",
-        "e\u0163i",
-        "i\u0163i",
-        "\xe2\u0163i",
-        "sei",
-        "ez",
-        "am",
-        "ai",
-        "au",
-        "ea",
-        "ia",
-        "ui",
-        "\xE2i",
-        "\u0103m",
-        "em",
-        "im",
-        "\xE2m",
-        "se",
+        'seser\u0103\u0163i',
+        'aser\u0103\u0163i',
+        'iser\u0103\u0163i',
+        '\xE2ser\u0103\u0163i',
+        'user\u0103\u0163i',
+        'seser\u0103m',
+        'aser\u0103m',
+        'iser\u0103m',
+        '\xE2ser\u0103m',
+        'user\u0103m',
+        'ser\u0103\u0163i',
+        'sese\u015Fi',
+        'seser\u0103',
+        'easc\u0103',
+        'ar\u0103\u0163i',
+        'ur\u0103\u0163i',
+        'ir\u0103\u0163i',
+        '\xE2r\u0103\u0163i',
+        'ase\u015Fi',
+        'aser\u0103',
+        'ise\u015Fi',
+        'iser\u0103',
+        '\xe2se\u015Fi',
+        '\xE2ser\u0103',
+        'use\u015Fi',
+        'user\u0103',
+        'ser\u0103m',
+        'sesem',
+        'indu',
+        '\xE2ndu',
+        'eaz\u0103',
+        'e\u015Fti',
+        'e\u015Fte',
+        '\u0103\u015Fti',
+        '\u0103\u015Fte',
+        'ea\u0163i',
+        'ia\u0163i',
+        'ar\u0103m',
+        'ur\u0103m',
+        'ir\u0103m',
+        '\xE2r\u0103m',
+        'asem',
+        'isem',
+        '\xE2sem',
+        'usem',
+        'se\u015Fi',
+        'ser\u0103',
+        'sese',
+        'are',
+        'ere',
+        'ire',
+        '\xE2re',
+        'ind',
+        '\xE2nd',
+        'eze',
+        'ezi',
+        'esc',
+        '\u0103sc',
+        'eam',
+        'eai',
+        'eau',
+        'iam',
+        'iai',
+        'iau',
+        'a\u015Fi',
+        'ar\u0103',
+        'u\u015Fi',
+        'ur\u0103',
+        'i\u015Fi',
+        'ir\u0103',
+        '\xE2\u015Fi',
+        '\xe2r\u0103',
+        'ase',
+        'ise',
+        '\xE2se',
+        'use',
+        'a\u0163i',
+        'e\u0163i',
+        'i\u0163i',
+        '\xe2\u0163i',
+        'sei',
+        'ez',
+        'am',
+        'ai',
+        'au',
+        'ea',
+        'ia',
+        'ui',
+        '\xE2i',
+        '\u0103m',
+        'em',
+        'im',
+        '\xE2m',
+        'se',
     )
 
     def stem(self, word):
@@ -4470,26 +4474,26 @@ class RomanianStemmer(_StandardStemmer):
                 if word.endswith(suffix):
                     if suffix in rv:
                         if suffix in (
-                            "seser\u0103\u0163i",
-                            "seser\u0103m",
-                            "ser\u0103\u0163i",
-                            "sese\u015Fi",
-                            "seser\u0103",
-                            "ser\u0103m",
-                            "sesem",
-                            "se\u015Fi",
-                            "ser\u0103",
-                            "sese",
-                            "a\u0163i",
-                            "e\u0163i",
-                            "i\u0163i",
-                            "\xE2\u0163i",
-                            "sei",
-                            "\u0103m",
-                            "em",
-                            "im",
-                            "\xE2m",
-                            "se",
+                            'seser\u0103\u0163i',
+                            'seser\u0103m',
+                            'ser\u0103\u0163i',
+                            'sese\u015Fi',
+                            'seser\u0103',
+                            'ser\u0103m',
+                            'sesem',
+                            'se\u015Fi',
+                            'ser\u0103',
+                            'sese',
+                            'a\u0163i',
+                            'e\u0163i',
+                            'i\u0163i',
+                            '\xE2\u0163i',
+                            'sei',
+                            '\u0103m',
+                            'em',
+                            'im',
+                            '\xE2m',
+                            'se',
                         ):
                             word = word[: -len(suffix)]
                             rv = rv[: -len(suffix)]
@@ -4550,326 +4554,326 @@ class RussianStemmer(_LanguageSpecificStemmer):
         "v",
     )
     __adjectival_suffixes = (
-        "ui^ushchi^ui^u",
-        "ui^ushchi^ai^a",
-        "ui^ushchimi",
-        "ui^ushchymi",
-        "ui^ushchego",
-        "ui^ushchogo",
-        "ui^ushchemu",
-        "ui^ushchomu",
-        "ui^ushchikh",
-        "ui^ushchykh",
-        "ui^ushchui^u",
-        "ui^ushchaia",
-        "ui^ushchoi^u",
-        "ui^ushchei^u",
-        "i^ushchi^ui^u",
-        "i^ushchi^ai^a",
-        "ui^ushchee",
-        "ui^ushchie",
-        "ui^ushchye",
-        "ui^ushchoe",
-        "ui^ushchei`",
-        "ui^ushchii`",
-        "ui^ushchyi`",
-        "ui^ushchoi`",
-        "ui^ushchem",
-        "ui^ushchim",
-        "ui^ushchym",
-        "ui^ushchom",
-        "i^ushchimi",
-        "i^ushchymi",
-        "i^ushchego",
-        "i^ushchogo",
-        "i^ushchemu",
-        "i^ushchomu",
-        "i^ushchikh",
-        "i^ushchykh",
-        "i^ushchui^u",
-        "i^ushchai^a",
-        "i^ushchoi^u",
-        "i^ushchei^u",
-        "i^ushchee",
-        "i^ushchie",
-        "i^ushchye",
-        "i^ushchoe",
-        "i^ushchei`",
-        "i^ushchii`",
-        "i^ushchyi`",
-        "i^ushchoi`",
-        "i^ushchem",
-        "i^ushchim",
-        "i^ushchym",
-        "i^ushchom",
-        "shchi^ui^u",
-        "shchi^ai^a",
-        "ivshi^ui^u",
-        "ivshi^ai^a",
-        "yvshi^ui^u",
-        "yvshi^ai^a",
-        "shchimi",
-        "shchymi",
-        "shchego",
-        "shchogo",
-        "shchemu",
-        "shchomu",
-        "shchikh",
-        "shchykh",
-        "shchui^u",
-        "shchai^a",
-        "shchoi^u",
-        "shchei^u",
-        "ivshimi",
-        "ivshymi",
-        "ivshego",
-        "ivshogo",
-        "ivshemu",
-        "ivshomu",
-        "ivshikh",
-        "ivshykh",
-        "ivshui^u",
-        "ivshai^a",
-        "ivshoi^u",
-        "ivshei^u",
-        "yvshimi",
-        "yvshymi",
-        "yvshego",
-        "yvshogo",
-        "yvshemu",
-        "yvshomu",
-        "yvshikh",
-        "yvshykh",
-        "yvshui^u",
-        "yvshai^a",
-        "yvshoi^u",
-        "yvshei^u",
-        "vshi^ui^u",
-        "vshi^ai^a",
-        "shchee",
-        "shchie",
-        "shchye",
-        "shchoe",
-        "shchei`",
-        "shchii`",
-        "shchyi`",
-        "shchoi`",
-        "shchem",
-        "shchim",
-        "shchym",
-        "shchom",
-        "ivshee",
-        "ivshie",
-        "ivshye",
-        "ivshoe",
-        "ivshei`",
-        "ivshii`",
-        "ivshyi`",
-        "ivshoi`",
-        "ivshem",
-        "ivshim",
-        "ivshym",
-        "ivshom",
-        "yvshee",
-        "yvshie",
-        "yvshye",
-        "yvshoe",
-        "yvshei`",
-        "yvshii`",
-        "yvshyi`",
-        "yvshoi`",
-        "yvshem",
-        "yvshim",
-        "yvshym",
-        "yvshom",
-        "vshimi",
-        "vshymi",
-        "vshego",
-        "vshogo",
-        "vshemu",
-        "vshomu",
-        "vshikh",
-        "vshykh",
-        "vshui^u",
-        "vshai^a",
-        "vshoi^u",
-        "vshei^u",
-        "emi^ui^u",
-        "emi^ai^a",
-        "nni^ui^u",
-        "nni^ai^a",
-        "vshee",
-        "vshie",
-        "vshye",
-        "vshoe",
-        "vshei`",
-        "vshii`",
-        "vshyi`",
-        "vshoi`",
-        "vshem",
-        "vshim",
-        "vshym",
-        "vshom",
-        "emimi",
-        "emymi",
-        "emego",
-        "emogo",
-        "ememu",
-        "emomu",
-        "emikh",
-        "emykh",
-        "emui^u",
-        "emai^a",
-        "emoi^u",
-        "emei^u",
-        "nnimi",
-        "nnymi",
-        "nnego",
-        "nnogo",
-        "nnemu",
-        "nnomu",
-        "nnikh",
-        "nnykh",
-        "nnui^u",
-        "nnai^a",
-        "nnoi^u",
-        "nnei^u",
-        "emee",
-        "emie",
-        "emye",
-        "emoe",
-        "emei`",
-        "emii`",
-        "emyi`",
-        "emoi`",
-        "emem",
-        "emim",
-        "emym",
-        "emom",
-        "nnee",
-        "nnie",
-        "nnye",
-        "nnoe",
-        "nnei`",
-        "nnii`",
-        "nnyi`",
-        "nnoi`",
-        "nnem",
-        "nnim",
-        "nnym",
-        "nnom",
-        "i^ui^u",
-        "i^ai^a",
-        "imi",
-        "ymi",
-        "ego",
-        "ogo",
-        "emu",
-        "omu",
-        "ikh",
-        "ykh",
-        "ui^u",
-        "ai^a",
-        "oi^u",
-        "ei^u",
-        "ee",
-        "ie",
-        "ye",
-        "oe",
-        "ei`",
-        "ii`",
-        "yi`",
-        "oi`",
-        "em",
-        "im",
-        "ym",
-        "om",
+        'ui^ushchi^ui^u',
+        'ui^ushchi^ai^a',
+        'ui^ushchimi',
+        'ui^ushchymi',
+        'ui^ushchego',
+        'ui^ushchogo',
+        'ui^ushchemu',
+        'ui^ushchomu',
+        'ui^ushchikh',
+        'ui^ushchykh',
+        'ui^ushchui^u',
+        'ui^ushchaia',
+        'ui^ushchoi^u',
+        'ui^ushchei^u',
+        'i^ushchi^ui^u',
+        'i^ushchi^ai^a',
+        'ui^ushchee',
+        'ui^ushchie',
+        'ui^ushchye',
+        'ui^ushchoe',
+        'ui^ushchei`',
+        'ui^ushchii`',
+        'ui^ushchyi`',
+        'ui^ushchoi`',
+        'ui^ushchem',
+        'ui^ushchim',
+        'ui^ushchym',
+        'ui^ushchom',
+        'i^ushchimi',
+        'i^ushchymi',
+        'i^ushchego',
+        'i^ushchogo',
+        'i^ushchemu',
+        'i^ushchomu',
+        'i^ushchikh',
+        'i^ushchykh',
+        'i^ushchui^u',
+        'i^ushchai^a',
+        'i^ushchoi^u',
+        'i^ushchei^u',
+        'i^ushchee',
+        'i^ushchie',
+        'i^ushchye',
+        'i^ushchoe',
+        'i^ushchei`',
+        'i^ushchii`',
+        'i^ushchyi`',
+        'i^ushchoi`',
+        'i^ushchem',
+        'i^ushchim',
+        'i^ushchym',
+        'i^ushchom',
+        'shchi^ui^u',
+        'shchi^ai^a',
+        'ivshi^ui^u',
+        'ivshi^ai^a',
+        'yvshi^ui^u',
+        'yvshi^ai^a',
+        'shchimi',
+        'shchymi',
+        'shchego',
+        'shchogo',
+        'shchemu',
+        'shchomu',
+        'shchikh',
+        'shchykh',
+        'shchui^u',
+        'shchai^a',
+        'shchoi^u',
+        'shchei^u',
+        'ivshimi',
+        'ivshymi',
+        'ivshego',
+        'ivshogo',
+        'ivshemu',
+        'ivshomu',
+        'ivshikh',
+        'ivshykh',
+        'ivshui^u',
+        'ivshai^a',
+        'ivshoi^u',
+        'ivshei^u',
+        'yvshimi',
+        'yvshymi',
+        'yvshego',
+        'yvshogo',
+        'yvshemu',
+        'yvshomu',
+        'yvshikh',
+        'yvshykh',
+        'yvshui^u',
+        'yvshai^a',
+        'yvshoi^u',
+        'yvshei^u',
+        'vshi^ui^u',
+        'vshi^ai^a',
+        'shchee',
+        'shchie',
+        'shchye',
+        'shchoe',
+        'shchei`',
+        'shchii`',
+        'shchyi`',
+        'shchoi`',
+        'shchem',
+        'shchim',
+        'shchym',
+        'shchom',
+        'ivshee',
+        'ivshie',
+        'ivshye',
+        'ivshoe',
+        'ivshei`',
+        'ivshii`',
+        'ivshyi`',
+        'ivshoi`',
+        'ivshem',
+        'ivshim',
+        'ivshym',
+        'ivshom',
+        'yvshee',
+        'yvshie',
+        'yvshye',
+        'yvshoe',
+        'yvshei`',
+        'yvshii`',
+        'yvshyi`',
+        'yvshoi`',
+        'yvshem',
+        'yvshim',
+        'yvshym',
+        'yvshom',
+        'vshimi',
+        'vshymi',
+        'vshego',
+        'vshogo',
+        'vshemu',
+        'vshomu',
+        'vshikh',
+        'vshykh',
+        'vshui^u',
+        'vshai^a',
+        'vshoi^u',
+        'vshei^u',
+        'emi^ui^u',
+        'emi^ai^a',
+        'nni^ui^u',
+        'nni^ai^a',
+        'vshee',
+        'vshie',
+        'vshye',
+        'vshoe',
+        'vshei`',
+        'vshii`',
+        'vshyi`',
+        'vshoi`',
+        'vshem',
+        'vshim',
+        'vshym',
+        'vshom',
+        'emimi',
+        'emymi',
+        'emego',
+        'emogo',
+        'ememu',
+        'emomu',
+        'emikh',
+        'emykh',
+        'emui^u',
+        'emai^a',
+        'emoi^u',
+        'emei^u',
+        'nnimi',
+        'nnymi',
+        'nnego',
+        'nnogo',
+        'nnemu',
+        'nnomu',
+        'nnikh',
+        'nnykh',
+        'nnui^u',
+        'nnai^a',
+        'nnoi^u',
+        'nnei^u',
+        'emee',
+        'emie',
+        'emye',
+        'emoe',
+        'emei`',
+        'emii`',
+        'emyi`',
+        'emoi`',
+        'emem',
+        'emim',
+        'emym',
+        'emom',
+        'nnee',
+        'nnie',
+        'nnye',
+        'nnoe',
+        'nnei`',
+        'nnii`',
+        'nnyi`',
+        'nnoi`',
+        'nnem',
+        'nnim',
+        'nnym',
+        'nnom',
+        'i^ui^u',
+        'i^ai^a',
+        'imi',
+        'ymi',
+        'ego',
+        'ogo',
+        'emu',
+        'omu',
+        'ikh',
+        'ykh',
+        'ui^u',
+        'ai^a',
+        'oi^u',
+        'ei^u',
+        'ee',
+        'ie',
+        'ye',
+        'oe',
+        'ei`',
+        'ii`',
+        'yi`',
+        'oi`',
+        'em',
+        'im',
+        'ym',
+        'om',
     )
     __reflexive_suffixes = ("si^a", "s'")
     __verb_suffixes = (
         "esh'",
-        "ei`te",
-        "ui`te",
-        "ui^ut",
+        'ei`te',
+        'ui`te',
+        'ui^ut',
         "ish'",
-        "ete",
-        "i`te",
-        "i^ut",
-        "nno",
-        "ila",
-        "yla",
-        "ena",
-        "ite",
-        "ili",
-        "yli",
-        "ilo",
-        "ylo",
-        "eno",
-        "i^at",
-        "uet",
-        "eny",
+        'ete',
+        'i`te',
+        'i^ut',
+        'nno',
+        'ila',
+        'yla',
+        'ena',
+        'ite',
+        'ili',
+        'yli',
+        'ilo',
+        'ylo',
+        'eno',
+        'i^at',
+        'uet',
+        'eny',
         "it'",
         "yt'",
-        "ui^u",
-        "la",
-        "na",
-        "li",
-        "em",
-        "lo",
-        "no",
-        "et",
-        "ny",
+        'ui^u',
+        'la',
+        'na',
+        'li',
+        'em',
+        'lo',
+        'no',
+        'et',
+        'ny',
         "t'",
-        "ei`",
-        "ui`",
-        "il",
-        "yl",
-        "im",
-        "ym",
-        "en",
-        "it",
-        "yt",
-        "i^u",
-        "i`",
-        "l",
-        "n",
+        'ei`',
+        'ui`',
+        'il',
+        'yl',
+        'im',
+        'ym',
+        'en',
+        'it',
+        'yt',
+        'i^u',
+        'i`',
+        'l',
+        'n',
     )
     __noun_suffixes = (
-        "ii^ami",
-        "ii^akh",
-        "i^ami",
-        "ii^am",
-        "i^akh",
-        "ami",
-        "iei`",
-        "i^am",
-        "iem",
-        "akh",
-        "ii^u",
+        'ii^ami',
+        'ii^akh',
+        'i^ami',
+        'ii^am',
+        'i^akh',
+        'ami',
+        'iei`',
+        'i^am',
+        'iem',
+        'akh',
+        'ii^u',
         "'i^u",
-        "ii^a",
+        'ii^a',
         "'i^a",
-        "ev",
-        "ov",
-        "ie",
+        'ev',
+        'ov',
+        'ie',
         "'e",
-        "ei",
-        "ii",
-        "ei`",
-        "oi`",
-        "ii`",
-        "em",
-        "am",
-        "om",
-        "i^u",
-        "i^a",
-        "a",
-        "e",
-        "i",
-        "i`",
-        "o",
-        "u",
-        "y",
+        'ei',
+        'ii',
+        'ei`',
+        'oi`',
+        'ii`',
+        'em',
+        'am',
+        'om',
+        'i^u',
+        'i^a',
+        'a',
+        'e',
+        'i',
+        'i`',
+        'o',
+        'u',
+        'y',
         "'",
     )
     __superlative_suffixes = ("ei`she", "ei`sh")
@@ -4894,10 +4898,8 @@ class RussianStemmer(_LanguageSpecificStemmer):
                 chr_exceeded = True
                 break
 
-        if not chr_exceeded:
-            return word
-
-        word = self.__cyrillic_to_roman(word)
+        if chr_exceeded:
+            word = self.__cyrillic_to_roman(word)
 
         step1_success = False
         adjectival_removed = False
@@ -4938,136 +4940,136 @@ class RussianStemmer(_LanguageSpecificStemmer):
             for suffix in self.__adjectival_suffixes:
                 if rv.endswith(suffix):
                     if suffix in (
-                        "i^ushchi^ui^u",
-                        "i^ushchi^ai^a",
-                        "i^ushchui^u",
-                        "i^ushchai^a",
-                        "i^ushchoi^u",
-                        "i^ushchei^u",
-                        "i^ushchimi",
-                        "i^ushchymi",
-                        "i^ushchego",
-                        "i^ushchogo",
-                        "i^ushchemu",
-                        "i^ushchomu",
-                        "i^ushchikh",
-                        "i^ushchykh",
-                        "shchi^ui^u",
-                        "shchi^ai^a",
-                        "i^ushchee",
-                        "i^ushchie",
-                        "i^ushchye",
-                        "i^ushchoe",
-                        "i^ushchei`",
-                        "i^ushchii`",
-                        "i^ushchyi`",
-                        "i^ushchoi`",
-                        "i^ushchem",
-                        "i^ushchim",
-                        "i^ushchym",
-                        "i^ushchom",
-                        "vshi^ui^u",
-                        "vshi^ai^a",
-                        "shchui^u",
-                        "shchai^a",
-                        "shchoi^u",
-                        "shchei^u",
-                        "emi^ui^u",
-                        "emi^ai^a",
-                        "nni^ui^u",
-                        "nni^ai^a",
-                        "shchimi",
-                        "shchymi",
-                        "shchego",
-                        "shchogo",
-                        "shchemu",
-                        "shchomu",
-                        "shchikh",
-                        "shchykh",
-                        "vshui^u",
-                        "vshai^a",
-                        "vshoi^u",
-                        "vshei^u",
-                        "shchee",
-                        "shchie",
-                        "shchye",
-                        "shchoe",
-                        "shchei`",
-                        "shchii`",
-                        "shchyi`",
-                        "shchoi`",
-                        "shchem",
-                        "shchim",
-                        "shchym",
-                        "shchom",
-                        "vshimi",
-                        "vshymi",
-                        "vshego",
-                        "vshogo",
-                        "vshemu",
-                        "vshomu",
-                        "vshikh",
-                        "vshykh",
-                        "emui^u",
-                        "emai^a",
-                        "emoi^u",
-                        "emei^u",
-                        "nnui^u",
-                        "nnai^a",
-                        "nnoi^u",
-                        "nnei^u",
-                        "vshee",
-                        "vshie",
-                        "vshye",
-                        "vshoe",
-                        "vshei`",
-                        "vshii`",
-                        "vshyi`",
-                        "vshoi`",
-                        "vshem",
-                        "vshim",
-                        "vshym",
-                        "vshom",
-                        "emimi",
-                        "emymi",
-                        "emego",
-                        "emogo",
-                        "ememu",
-                        "emomu",
-                        "emikh",
-                        "emykh",
-                        "nnimi",
-                        "nnymi",
-                        "nnego",
-                        "nnogo",
-                        "nnemu",
-                        "nnomu",
-                        "nnikh",
-                        "nnykh",
-                        "emee",
-                        "emie",
-                        "emye",
-                        "emoe",
-                        "emei`",
-                        "emii`",
-                        "emyi`",
-                        "emoi`",
-                        "emem",
-                        "emim",
-                        "emym",
-                        "emom",
-                        "nnee",
-                        "nnie",
-                        "nnye",
-                        "nnoe",
-                        "nnei`",
-                        "nnii`",
-                        "nnyi`",
-                        "nnoi`",
-                        "nnem",
-                        "nnim",
-                        "nnym",
-                        "nnom",
+                        'i^ushchi^ui^u',
+                        'i^ushchi^ai^a',
+                        'i^ushchui^u',
+                        'i^ushchai^a',
+                        'i^ushchoi^u',
+                        'i^ushchei^u',
+                        'i^ushchimi',
+                        'i^ushchymi',
+                        'i^ushchego',
+                        'i^ushchogo',
+                        'i^ushchemu',
+                        'i^ushchomu',
+                        'i^ushchikh',
+                        'i^ushchykh',
+                        'shchi^ui^u',
+                        'shchi^ai^a',
+                        'i^ushchee',
+                        'i^ushchie',
+                        'i^ushchye',
+                        'i^ushchoe',
+                        'i^ushchei`',
+                        'i^ushchii`',
+                        'i^ushchyi`',
+                        'i^ushchoi`',
+                        'i^ushchem',
+                        'i^ushchim',
+                        'i^ushchym',
+                        'i^ushchom',
+                        'vshi^ui^u',
+                        'vshi^ai^a',
+                        'shchui^u',
+                        'shchai^a',
+                        'shchoi^u',
+                        'shchei^u',
+                        'emi^ui^u',
+                        'emi^ai^a',
+                        'nni^ui^u',
+                        'nni^ai^a',
+                        'shchimi',
+                        'shchymi',
+                        'shchego',
+                        'shchogo',
+                        'shchemu',
+                        'shchomu',
+                        'shchikh',
+                        'shchykh',
+                        'vshui^u',
+                        'vshai^a',
+                        'vshoi^u',
+                        'vshei^u',
+                        'shchee',
+                        'shchie',
+                        'shchye',
+                        'shchoe',
+                        'shchei`',
+                        'shchii`',
+                        'shchyi`',
+                        'shchoi`',
+                        'shchem',
+                        'shchim',
+                        'shchym',
+                        'shchom',
+                        'vshimi',
+                        'vshymi',
+                        'vshego',
+                        'vshogo',
+                        'vshemu',
+                        'vshomu',
+                        'vshikh',
+                        'vshykh',
+                        'emui^u',
+                        'emai^a',
+                        'emoi^u',
+                        'emei^u',
+                        'nnui^u',
+                        'nnai^a',
+                        'nnoi^u',
+                        'nnei^u',
+                        'vshee',
+                        'vshie',
+                        'vshye',
+                        'vshoe',
+                        'vshei`',
+                        'vshii`',
+                        'vshyi`',
+                        'vshoi`',
+                        'vshem',
+                        'vshim',
+                        'vshym',
+                        'vshom',
+                        'emimi',
+                        'emymi',
+                        'emego',
+                        'emogo',
+                        'ememu',
+                        'emomu',
+                        'emikh',
+                        'emykh',
+                        'nnimi',
+                        'nnymi',
+                        'nnego',
+                        'nnogo',
+                        'nnemu',
+                        'nnomu',
+                        'nnikh',
+                        'nnykh',
+                        'emee',
+                        'emie',
+                        'emye',
+                        'emoe',
+                        'emei`',
+                        'emii`',
+                        'emyi`',
+                        'emoi`',
+                        'emem',
+                        'emim',
+                        'emym',
+                        'emom',
+                        'nnee',
+                        'nnie',
+                        'nnye',
+                        'nnoe',
+                        'nnei`',
+                        'nnii`',
+                        'nnyi`',
+                        'nnoi`',
+                        'nnem',
+                        'nnim',
+                        'nnym',
+                        'nnom',
                     ):
                         if (
                             rv[-len(suffix) - 3 : -len(suffix)] == "i^a"
@@ -5160,7 +5162,8 @@ class RussianStemmer(_LanguageSpecificStemmer):
             if word.endswith("'"):
                 word = word[:-1]
 
-        word = self.__roman_to_cyrillic(word)
+        if chr_exceeded:
+            word = self.__roman_to_cyrillic(word)
 
         return word
 
@@ -5394,164 +5397,164 @@ class SpanishStemmer(_StandardStemmer):
         "lo",
     )
     __step1_suffixes = (
-        "amientos",
-        "imientos",
-        "amiento",
-        "imiento",
-        "aciones",
-        "uciones",
-        "adoras",
-        "adores",
-        "ancias",
-        "log\xEDas",
-        "encias",
-        "amente",
-        "idades",
-        "anzas",
-        "ismos",
-        "ables",
-        "ibles",
-        "istas",
-        "adora",
-        "aci\xF3n",
-        "antes",
-        "ancia",
-        "log\xEDa",
-        "uci\xf3n",
-        "encia",
-        "mente",
-        "anza",
-        "icos",
-        "icas",
-        "ismo",
-        "able",
-        "ible",
-        "ista",
-        "osos",
-        "osas",
-        "ador",
-        "ante",
-        "idad",
-        "ivas",
-        "ivos",
-        "ico",
-        "ica",
-        "oso",
-        "osa",
-        "iva",
-        "ivo",
+        'amientos',
+        'imientos',
+        'amiento',
+        'imiento',
+        'aciones',
+        'uciones',
+        'adoras',
+        'adores',
+        'ancias',
+        'log\xEDas',
+        'encias',
+        'amente',
+        'idades',
+        'anzas',
+        'ismos',
+        'ables',
+        'ibles',
+        'istas',
+        'adora',
+        'aci\xF3n',
+        'antes',
+        'ancia',
+        'log\xEDa',
+        'uci\xf3n',
+        'encia',
+        'mente',
+        'anza',
+        'icos',
+        'icas',
+        'ismo',
+        'able',
+        'ible',
+        'ista',
+        'osos',
+        'osas',
+        'ador',
+        'ante',
+        'idad',
+        'ivas',
+        'ivos',
+        'ico',
+        'ica',
+        'oso',
+        'osa',
+        'iva',
+        'ivo',
     )
     __step2a_suffixes = (
-        "yeron",
-        "yendo",
-        "yamos",
-        "yais",
-        "yan",
-        "yen",
-        "yas",
-        "yes",
-        "ya",
-        "ye",
-        "yo",
-        "y\xF3",
+        'yeron',
+        'yendo',
+        'yamos',
+        'yais',
+        'yan',
+        'yen',
+        'yas',
+        'yes',
+        'ya',
+        'ye',
+        'yo',
+        'y\xF3',
     )
     __step2b_suffixes = (
-        "ar\xEDamos",
-        "er\xEDamos",
-        "ir\xEDamos",
-        "i\xE9ramos",
-        "i\xE9semos",
-        "ar\xEDais",
-        "aremos",
-        "er\xEDais",
-        "eremos",
-        "ir\xEDais",
-        "iremos",
-        "ierais",
-        "ieseis",
-        "asteis",
-        "isteis",
-        "\xE1bamos",
-        "\xE1ramos",
-        "\xE1semos",
-        "ar\xEDan",
-        "ar\xEDas",
-        "ar\xE9is",
-        "er\xEDan",
-        "er\xEDas",
-        "er\xE9is",
-        "ir\xEDan",
-        "ir\xEDas",
-        "ir\xE9is",
-        "ieran",
-        "iesen",
-        "ieron",
-        "iendo",
-        "ieras",
-        "ieses",
-        "abais",
-        "arais",
-        "aseis",
-        "\xE9amos",
-        "ar\xE1n",
-        "ar\xE1s",
-        "ar\xEDa",
-        "er\xE1n",
-        "er\xE1s",
-        "er\xEDa",
-        "ir\xE1n",
-        "ir\xE1s",
-        "ir\xEDa",
-        "iera",
-        "iese",
-        "aste",
-        "iste",
-        "aban",
-        "aran",
-        "asen",
-        "aron",
-        "ando",
-        "abas",
-        "adas",
-        "idas",
-        "aras",
-        "ases",
-        "\xEDais",
-        "ados",
-        "idos",
-        "amos",
-        "imos",
-        "emos",
-        "ar\xE1",
-        "ar\xE9",
-        "er\xE1",
-        "er\xE9",
-        "ir\xE1",
-        "ir\xE9",
-        "aba",
-        "ada",
-        "ida",
-        "ara",
-        "ase",
-        "\xEDan",
-        "ado",
-        "ido",
-        "\xEDas",
-        "\xE1is",
-        "\xE9is",
-        "\xEDa",
-        "ad",
-        "ed",
-        "id",
-        "an",
-        "i\xF3",
-        "ar",
-        "er",
-        "ir",
-        "as",
-        "\xEDs",
-        "en",
-        "es",
+        'ar\xEDamos',
+        'er\xEDamos',
+        'ir\xEDamos',
+        'i\xE9ramos',
+        'i\xE9semos',
+        'ar\xEDais',
+        'aremos',
+        'er\xEDais',
+        'eremos',
+        'ir\xEDais',
+        'iremos',
+        'ierais',
+        'ieseis',
+        'asteis',
+        'isteis',
+        '\xE1bamos',
+        '\xE1ramos',
+        '\xE1semos',
+        'ar\xEDan',
+        'ar\xEDas',
+        'ar\xE9is',
+        'er\xEDan',
+        'er\xEDas',
+        'er\xE9is',
+        'ir\xEDan',
+        'ir\xEDas',
+        'ir\xE9is',
+        'ieran',
+        'iesen',
+        'ieron',
+        'iendo',
+        'ieras',
+        'ieses',
+        'abais',
+        'arais',
+        'aseis',
+        '\xE9amos',
+        'ar\xE1n',
+        'ar\xE1s',
+        'ar\xEDa',
+        'er\xE1n',
+        'er\xE1s',
+        'er\xEDa',
+        'ir\xE1n',
+        'ir\xE1s',
+        'ir\xEDa',
+        'iera',
+        'iese',
+        'aste',
+        'iste',
+        'aban',
+        'aran',
+        'asen',
+        'aron',
+        'ando',
+        'abas',
+        'adas',
+        'idas',
+        'aras',
+        'ases',
+        '\xEDais',
+        'ados',
+        'idos',
+        'amos',
+        'imos',
+        'emos',
+        'ar\xE1',
+        'ar\xE9',
+        'er\xE1',
+        'er\xE9',
+        'ir\xE1',
+        'ir\xE9',
+        'aba',
+        'ada',
+        'ida',
+        'ara',
+        'ase',
+        '\xEDan',
+        'ado',
+        'ido',
+        '\xEDas',
+        '\xE1is',
+        '\xE9is',
+        '\xEDa',
+        'ad',
+        'ed',
+        'id',
+        'an',
+        'i\xF3',
+        'ar',
+        'er',
+        'ir',
+        'as',
+        '\xEDs',
+        'en',
+        'es',
     )
     __step3_suffixes = ("os", "a", "e", "o", "\xE1", "\xE9", "\xED", "\xF3")
 
@@ -5875,6 +5878,7 @@ def demo():
 
     """
 
+    import re
     from nltk.corpus import udhr
 
     udhr_corpus = {
@@ -5927,16 +5931,16 @@ def demo():
         excerpt = udhr.words(udhr_corpus[language])[:300]
 
         stemmed = " ".join(stemmer.stem(word) for word in excerpt)
-        stemmed = re.sub(r"(.{,70})\s", r"\1\n", stemmed + " ").rstrip()
+        stemmed = re.sub(r"(.{,70})\s", r'\1\n', stemmed + ' ').rstrip()
         excerpt = " ".join(excerpt)
-        excerpt = re.sub(r"(.{,70})\s", r"\1\n", excerpt + " ").rstrip()
+        excerpt = re.sub(r"(.{,70})\s", r'\1\n', excerpt + ' ').rstrip()
 
         print("\n")
-        print("-" * 70)
-        print("ORIGINAL".center(70))
+        print('-' * 70)
+        print('ORIGINAL'.center(70))
         print(excerpt)
         print("\n\n")
-        print("STEMMED RESULTS".center(70))
+        print('STEMMED RESULTS'.center(70))
         print(stemmed)
-        print("-" * 70)
+        print('-' * 70)
         print("\n")
index eec97bd..0daad9d 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Stemmer Utilities
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Helder <he7d3r@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
index 33fe049..da521a3 100644 (file)
@@ -1,15 +1,18 @@
 # Natural Language Toolkit: WordNet stemmer interface
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import unicode_literals
 
 from nltk.corpus.reader.wordnet import NOUN
 from nltk.corpus import wordnet
+from nltk.compat import python_2_unicode_compatible
 
 
+@python_2_unicode_compatible
 class WordNetLemmatizer(object):
     """
     WordNet Lemmatizer
@@ -39,7 +42,7 @@ class WordNetLemmatizer(object):
         return min(lemmas, key=len) if lemmas else word
 
     def __repr__(self):
-        return "<WordNetLemmatizer>"
+        return '<WordNetLemmatizer>'
 
 
 # unload wordnet
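
For reference, a minimal usage sketch for the lemmatizer declared above, assuming the WordNet corpus has already been fetched with nltk.download('wordnet') (sample words are illustrative):

    # Hypothetical usage sketch; requires the 'wordnet' corpus data.
    from nltk.stem import WordNetLemmatizer

    wnl = WordNetLemmatizer()
    print(wnl.lemmatize("dogs"))             # noun is the default part of speech
    print(wnl.lemmatize("better", pos="a"))  # adjective lookup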
index 7e6d272..9381205 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Taggers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com> (minor additions)
 # URL: <http://nltk.org/>
@@ -63,6 +63,7 @@ We evaluate a tagger on data that was not seen during training:
 
 For more information, please consult chapter 5 of the NLTK Book.
 """
+from __future__ import print_function
 
 from nltk.tag.api import TaggerI
 from nltk.tag.util import str2tuple, tuple2str, untag
@@ -93,14 +94,14 @@ from nltk.tag.perceptron import PerceptronTagger
 from nltk.data import load, find
 
 RUS_PICKLE = (
-    "taggers/averaged_perceptron_tagger_ru/averaged_perceptron_tagger_ru.pickle"
+    'taggers/averaged_perceptron_tagger_ru/averaged_perceptron_tagger_ru.pickle'
 )
 
 
 def _get_tagger(lang=None):
-    if lang == "rus":
+    if lang == 'rus':
         tagger = PerceptronTagger(False)
-        ap_russian_model_loc = "file:" + str(find(RUS_PICKLE))
+        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
         tagger.load(ap_russian_model_loc)
     else:
         tagger = PerceptronTagger()
@@ -109,7 +110,7 @@ def _get_tagger(lang=None):
 
 def _pos_tag(tokens, tagset=None, tagger=None, lang=None):
     # Currently only supports English and Russian.
-    if lang not in ["eng", "rus"]:
+    if lang not in ['eng', 'rus']:
         raise NotImplementedError(
             "Currently, NLTK pos_tag only supports English and Russian "
             "(i.e. lang='eng' or lang='rus')"
@@ -117,22 +118,22 @@ def _pos_tag(tokens, tagset=None, tagger=None, lang=None):
     else:
         tagged_tokens = tagger.tag(tokens)
         if tagset:  # Maps to the specified tagset.
-            if lang == "eng":
+            if lang == 'eng':
                 tagged_tokens = [
-                    (token, map_tag("en-ptb", tagset, tag))
+                    (token, map_tag('en-ptb', tagset, tag))
                     for (token, tag) in tagged_tokens
                 ]
-            elif lang == "rus":
+            elif lang == 'rus':
                 # Note that the new Russian pos tags from the model contain suffixes,
                 # see https://github.com/nltk/nltk/issues/2151#issuecomment-430709018
                 tagged_tokens = [
-                    (token, map_tag("ru-rnc-new", tagset, tag.partition("=")[0]))
+                    (token, map_tag('ru-rnc-new', tagset, tag.partition('=')[0]))
                     for (token, tag) in tagged_tokens
                 ]
         return tagged_tokens
 
 
-def pos_tag(tokens, tagset=None, lang="eng"):
+def pos_tag(tokens, tagset=None, lang='eng'):
     """
     Use NLTK's currently recommended part of speech tagger to
     tag the given list of tokens.
@@ -161,13 +162,13 @@ def pos_tag(tokens, tagset=None, lang="eng"):
     return _pos_tag(tokens, tagset, tagger, lang)
 
 
-def pos_tag_sents(sentences, tagset=None, lang="eng"):
+def pos_tag_sents(sentences, tagset=None, lang='eng'):
     """
     Use NLTK's currently recommended part of speech tagger to tag the
     given list of sentences, each consisting of a list of tokens.
 
-    :param sentences: List of sentences to be tagged
-    :type sentences: list(list(str))
+    :param tokens: List of sentences to be tagged
+    :type tokens: list(list(str))
     :param tagset: the tagset to be used, e.g. universal, wsj, brown
     :type tagset: str
     :param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian
@@ -176,4 +177,4 @@ def pos_tag_sents(sentences, tagset=None, lang="eng"):
     :rtype: list(list(tuple(str, str)))
     """
     tagger = _get_tagger(lang)
-    return [_pos_tag(sent, tagset, tagger, lang) for sent in sentences]
+    return [_pos_tag(sent, tagset, tagger) for sent in sentences]
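
For reference, the tagging entry point edited above is normally used as below; a minimal sketch, assuming the averaged perceptron tagger model has been downloaded (the tokens are illustrative):

    # Hypothetical usage sketch; requires nltk.download('averaged_perceptron_tagger').
    from nltk import pos_tag

    print(pos_tag(["This", "is", "a", "test"]))   # English is the default (lang='eng')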
index 45da18c..bfd2eea 100644 (file)
Binary files a/nlp_resource_data/nltk/tag/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/tag/__pycache__/__init__.cpython-37.pyc differ
index c683081..f3a62a1 100644 (file)
Binary files a/nlp_resource_data/nltk/tag/__pycache__/api.cpython-37.pyc and b/nlp_resource_data/nltk/tag/__pycache__/api.cpython-37.pyc differ
index 1ccd1f8..10bc513 100644 (file)
Binary files a/nlp_resource_data/nltk/tag/__pycache__/brill.cpython-37.pyc and b/nlp_resource_data/nltk/tag/__pycache__/brill.cpython-37.pyc differ
index cb58482..e40d4ed 100644 (file)
Binary files a/nlp_resource_data/nltk/tag/__pycache__/brill_trainer.cpython-37.pyc and b/nlp_resource_data/nltk/tag/__pycache__/brill_trainer.cpython-37.pyc differ
index 9b51dc0..41c152a 100644 (file)
Binary files a/nlp_resource_data/nltk/tag/__pycache__/crf.cpython-37.pyc and b/nlp_resource_data/nltk/tag/__pycache__/crf.cpython-37.pyc differ
index 2123978..dd7178e 100644 (file)
Binary files a/nlp_resource_data/nltk/tag/__pycache__/hmm.cpython-37.pyc and b/nlp_resource_data/nltk/tag/__pycache__/hmm.cpython-37.pyc differ
index af03b86..5cfd6e5 100644 (file)
Binary files a/nlp_resource_data/nltk/tag/__pycache__/hunpos.cpython-37.pyc and b/nlp_resource_data/nltk/tag/__pycache__/hunpos.cpython-37.pyc differ
index a2feb16..29fb734 100644 (file)
Binary files a/nlp_resource_data/nltk/tag/__pycache__/mapping.cpython-37.pyc and b/nlp_resource_data/nltk/tag/__pycache__/mapping.cpython-37.pyc differ
index 962a5df..b718b89 100644 (file)
Binary files a/nlp_resource_data/nltk/tag/__pycache__/perceptron.cpython-37.pyc and b/nlp_resource_data/nltk/tag/__pycache__/perceptron.cpython-37.pyc differ
index 2c9238e..0726c46 100644 (file)
Binary files a/nlp_resource_data/nltk/tag/__pycache__/senna.cpython-37.pyc and b/nlp_resource_data/nltk/tag/__pycache__/senna.cpython-37.pyc differ
index d808ebf..05dcf32 100644 (file)
Binary files a/nlp_resource_data/nltk/tag/__pycache__/sequential.cpython-37.pyc and b/nlp_resource_data/nltk/tag/__pycache__/sequential.cpython-37.pyc differ
index f85ec79..016ccda 100644 (file)
Binary files a/nlp_resource_data/nltk/tag/__pycache__/stanford.cpython-37.pyc and b/nlp_resource_data/nltk/tag/__pycache__/stanford.cpython-37.pyc differ
index e66f53c..cc0cb1a 100644 (file)
Binary files a/nlp_resource_data/nltk/tag/__pycache__/tnt.cpython-37.pyc and b/nlp_resource_data/nltk/tag/__pycache__/tnt.cpython-37.pyc differ
index 48708c0..1366e57 100644 (file)
Binary files a/nlp_resource_data/nltk/tag/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/tag/__pycache__/util.cpython-37.pyc differ
index c72fb03..0d4ffda 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Tagger Interface
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com> (minor additions)
 # URL: <http://nltk.org/>
@@ -13,13 +13,15 @@ information, such as its part of speech.
 from abc import ABCMeta, abstractmethod
 from itertools import chain
 
+from six import add_metaclass
 
 from nltk.internals import overridden
 from nltk.metrics import accuracy
 from nltk.tag.util import untag
 
 
-class TaggerI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class TaggerI(object):
     """
     A processing interface for assigning a tag to each token in a list.
     Tags are case sensitive strings that identify some property of each
@@ -72,7 +74,7 @@ class TaggerI(metaclass=ABCMeta):
 
     def _check_params(self, train, model):
         if (train and model) or (not train and not model):
-            raise ValueError("Must specify either training data or trained model.")
+            raise ValueError('Must specify either training data or trained model.')
 
 
 class FeaturesetTaggerI(TaggerI):
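
As a sketch of the interface declared above: a concrete tagger only has to override tag(); tag_sents() and evaluate() are then inherited from TaggerI. The toy tagger below is illustrative and not part of NLTK:

    from nltk.tag.api import TaggerI

    class CapitalizationTagger(TaggerI):
        """Toy tagger: 'CAP' for capitalized tokens, 'LOW' for everything else."""
        def tag(self, tokens):
            return [(tok, 'CAP' if tok[:1].isupper() else 'LOW') for tok in tokens]

    t = CapitalizationTagger()
    print(t.tag(['The', 'cat', 'sat']))                    # [('The', 'CAP'), ('cat', 'LOW'), ('sat', 'LOW')]
    print(t.evaluate([[('The', 'CAP'), ('cat', 'LOW')]]))  # accuracy against gold tags -> 1.0
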
index fe280a3..b44e335 100644 (file)
@@ -1,13 +1,15 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Transformation-based learning
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Marcus Uneson <marcus.uneson@gmail.com>
 #   based on previous (nltk2) version by
 #   Christopher Maloof, Edward Loper, Steven Bird
 # URL: <http://nltk.org/>
 # For license information, see  LICENSE.TXT
 
+from __future__ import print_function, division
+
 from collections import defaultdict, Counter
 
 from nltk.tag import TaggerI
@@ -26,7 +28,7 @@ class Word(Feature):
     Feature which examines the text (word) of nearby tokens.
     """
 
-    json_tag = "nltk.tag.brill.Word"
+    json_tag = 'nltk.tag.brill.Word'
 
     @staticmethod
     def extract_property(tokens, index):
@@ -40,7 +42,7 @@ class Pos(Feature):
     Feature which examines the tags of nearby tokens.
     """
 
-    json_tag = "nltk.tag.brill.Pos"
+    json_tag = 'nltk.tag.brill.Pos'
 
     @staticmethod
     def extract_property(tokens, index):
@@ -204,7 +206,7 @@ class BrillTagger(TaggerI):
     of the TaggerTrainers available.
     """
 
-    json_tag = "nltk.tag.BrillTagger"
+    json_tag = 'nltk.tag.BrillTagger'
 
     def __init__(self, initial_tagger, rules, training_stats=None):
         """
@@ -304,7 +306,7 @@ class BrillTagger(TaggerI):
         tids = [r.templateid for r in self._rules]
         train_stats = self.train_stats()
 
-        trainscores = train_stats["rulescores"]
+        trainscores = train_stats['rulescores']
         assert len(trainscores) == len(tids), (
             "corrupt statistics: "
             "{0} train scores for {1} rules".format(trainscores, tids)
@@ -349,7 +351,7 @@ class BrillTagger(TaggerI):
                 print(s)
 
         def print_testtrain_stats():
-            testscores = test_stats["rulescores"]
+            testscores = test_stats['rulescores']
             print(
                 "TEMPLATE STATISTICS (TEST AND TRAIN) ({0} templates, {1} rules)".format(
                     len(template_counts), len(tids)
@@ -427,24 +429,24 @@ class BrillTagger(TaggerI):
             return sum(t[1] != g[1] for pair in zip(xs, gold) for (t, g) in zip(*pair))
 
         testing_stats = {}
-        testing_stats["tokencount"] = sum(len(t) for t in sequences)
-        testing_stats["sequencecount"] = len(sequences)
+        testing_stats['tokencount'] = sum(len(t) for t in sequences)
+        testing_stats['sequencecount'] = len(sequences)
         tagged_tokenses = [self._initial_tagger.tag(tokens) for tokens in sequences]
-        testing_stats["initialerrors"] = counterrors(tagged_tokenses)
-        testing_stats["initialacc"] = (
-            1 - testing_stats["initialerrors"] / testing_stats["tokencount"]
+        testing_stats['initialerrors'] = counterrors(tagged_tokenses)
+        testing_stats['initialacc'] = (
+            1 - testing_stats['initialerrors'] / testing_stats['tokencount']
         )
         # Apply each rule to the entire corpus, in order
-        errors = [testing_stats["initialerrors"]]
+        errors = [testing_stats['initialerrors']]
         for rule in self._rules:
             for tagged_tokens in tagged_tokenses:
                 rule.apply(tagged_tokens)
             errors.append(counterrors(tagged_tokenses))
-        testing_stats["rulescores"] = [
+        testing_stats['rulescores'] = [
             err0 - err1 for (err0, err1) in zip(errors, errors[1:])
         ]
-        testing_stats["finalerrors"] = errors[-1]
-        testing_stats["finalacc"] = (
-            1 - testing_stats["finalerrors"] / testing_stats["tokencount"]
+        testing_stats['finalerrors'] = errors[-1]
+        testing_stats['finalacc'] = (
+            1 - testing_stats['finalerrors'] / testing_stats['tokencount']
         )
         return (tagged_tokenses, testing_stats)
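
For orientation, the testing statistics assembled above reduce to simple error accounting over the applied rules; a sketch with made-up numbers (the real dict is built by the method above):

    # Illustrative values only.
    testing_stats = {'tokencount': 1000, 'sequencecount': 50, 'initialerrors': 120}
    testing_stats['initialacc'] = 1 - testing_stats['initialerrors'] / testing_stats['tokencount']  # 0.88

    errors = [120, 95, 80, 78]  # error count after the initial tagging and after each rule in turn
    testing_stats['rulescores'] = [e0 - e1 for e0, e1 in zip(errors, errors[1:])]  # [25, 15, 2]
    testing_stats['finalerrors'] = errors[-1]
    testing_stats['finalacc'] = 1 - testing_stats['finalerrors'] / testing_stats['tokencount']  # 0.922
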
index b284a03..f518dcf 100644 (file)
@@ -8,6 +8,8 @@
 # URL: <http://nltk.org/>
 # For license information, see  LICENSE.TXT
 
+from __future__ import print_function, division
+
 import bisect
 import textwrap
 from collections import defaultdict
@@ -251,19 +253,19 @@ class BrillTaggerTrainer(object):
 
         # Collect some statistics on the training process
         trainstats = {}
-        trainstats["min_acc"] = min_acc
-        trainstats["min_score"] = min_score
-        trainstats["tokencount"] = sum(len(t) for t in test_sents)
-        trainstats["sequencecount"] = len(test_sents)
-        trainstats["templatecount"] = len(self._templates)
-        trainstats["rulescores"] = []
-        trainstats["initialerrors"] = sum(
+        trainstats['min_acc'] = min_acc
+        trainstats['min_score'] = min_score
+        trainstats['tokencount'] = sum(len(t) for t in test_sents)
+        trainstats['sequencecount'] = len(test_sents)
+        trainstats['templatecount'] = len(self._templates)
+        trainstats['rulescores'] = []
+        trainstats['initialerrors'] = sum(
             tag[1] != truth[1]
             for paired in zip(test_sents, train_sents)
             for (tag, truth) in zip(*paired)
         )
-        trainstats["initialacc"] = (
-            1 - trainstats["initialerrors"] / trainstats["tokencount"]
+        trainstats['initialacc'] = (
+            1 - trainstats['initialerrors'] / trainstats['tokencount']
         )
         if self._trace > 0:
             print(
@@ -280,7 +282,7 @@ class BrillTaggerTrainer(object):
             print("Finding initial useful rules...")
         self._init_mappings(test_sents, train_sents)
         if self._trace:
-            print(("    Found {} useful rules.".format(len(self._rule_scores))))
+            print(("    Found %d useful rules." % len(self._rule_scores)))
 
         # Let the user know what we're up to.
         if self._trace > 2:
@@ -297,7 +299,7 @@ class BrillTaggerTrainer(object):
                 if rule:
                     rules.append(rule)
                     score = self._rule_scores[rule]
-                    trainstats["rulescores"].append(score)
+                    trainstats['rulescores'].append(score)
                 else:
                     break  # No more good rules left!
 
@@ -318,15 +320,15 @@ class BrillTaggerTrainer(object):
 
         # The user can cancel training manually:
         except KeyboardInterrupt:
-            print("Training stopped manually -- {} rules found".format(len(rules)))
+            print("Training stopped manually -- %d rules found" % len(rules))
 
         # Discard our tag position mapping & rule mappings.
         self._clean()
-        trainstats["finalerrors"] = trainstats["initialerrors"] - sum(
-            trainstats["rulescores"]
+        trainstats['finalerrors'] = trainstats['initialerrors'] - sum(
+            trainstats['rulescores']
         )
-        trainstats["finalacc"] = (
-            1 - trainstats["finalerrors"] / trainstats["tokencount"]
+        trainstats['finalacc'] = (
+            1 - trainstats['finalerrors'] / trainstats['tokencount']
         )
         # Create and return a tagger from the rules we found.
         return BrillTagger(self._initial_tagger, rules, trainstats)
@@ -600,30 +602,30 @@ class BrillTaggerTrainer(object):
         rulestr = rule.format(self._ruleformat)
         if self._trace > 2:
             print(
-                "{:4d}{:4d}{:4d}{:4d}  |".format(score, num_fixed, num_broken, num_other), end=" "
+                '%4d%4d%4d%4d  |' % (score, num_fixed, num_broken, num_other), end=' '
             )
             print(
                 textwrap.fill(
                     rulestr,
-                    initial_indent=" " * 20,
+                    initial_indent=' ' * 20,
                     width=79,
-                    subsequent_indent=" " * 18 + "|   ",
+                    subsequent_indent=' ' * 18 + '|   ',
                 ).strip()
             )
         else:
             print(rulestr)
 
     def _trace_apply(self, num_updates):
-        prefix = " " * 18 + "|"
+        prefix = ' ' * 18 + '|'
         print(prefix)
-        print(prefix, "Applying rule to {} positions.".format(num_updates))
+        print(prefix, 'Applying rule to %d positions.' % num_updates)
 
     def _trace_update_rules(self, num_obsolete, num_new, num_unseen):
-        prefix = " " * 18 + "|"
-        print(prefix, "Updated rule tables:")
-        print(prefix, ("  - {} rule applications removed".format(num_obsolete)))
+        prefix = ' ' * 18 + '|'
+        print(prefix, 'Updated rule tables:')
+        print(prefix, ('  - %d rule applications removed' % num_obsolete))
         print(
             prefix,
-            ("  - {} rule applications added ({} novel)".format(num_new, num_unseen)),
+            ('  - %d rule applications added (%d novel)' % (num_new, num_unseen)),
         )
         print(prefix)
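
A rough end-to-end sketch of driving this trainer (the corpus slice, the unigram baseline and the brill24 template set are assumptions for illustration; requires the treebank corpus):

    from nltk.corpus import treebank
    from nltk.tag import DefaultTagger, UnigramTagger
    from nltk.tag.brill import brill24
    from nltk.tag.brill_trainer import BrillTaggerTrainer

    train_sents = treebank.tagged_sents()[:500]
    baseline = UnigramTagger(train_sents, backoff=DefaultTagger('NN'))

    trainer = BrillTaggerTrainer(baseline, brill24(), trace=1)
    tagger = trainer.train(train_sents, max_rules=20)
    print(tagger.train_stats()['finalacc'])
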
index 48f9de1..828125f 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Interface to the CRFSuite Tagger
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Long Duong <longdt219@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -9,7 +9,8 @@
 """
 A module for POS tagging using CRFSuite
 """
-
+from __future__ import absolute_import
+from __future__ import unicode_literals
 import unicodedata
 import re
 from nltk.tag.api import TaggerI
@@ -78,7 +79,7 @@ class CRFTagger(TaggerI):
 
         """
 
-        self._model_file = ""
+        self._model_file = ''
         self._tagger = pycrfsuite.Tagger()
 
         if feature_func is None:
@@ -88,7 +89,7 @@ class CRFTagger(TaggerI):
 
         self._verbose = verbose
         self._training_options = training_opt
-        self._pattern = re.compile(r"\d")
+        self._pattern = re.compile(r'\d')
 
     def set_model_file(self, model_file):
         self._model_file = model_file
@@ -117,31 +118,31 @@ class CRFTagger(TaggerI):
 
         # Capitalization
         if token[0].isupper():
-            feature_list.append("CAPITALIZATION")
+            feature_list.append('CAPITALIZATION')
 
         # Number
         if re.search(self._pattern, token) is not None:
-            feature_list.append("HAS_NUM")
+            feature_list.append('HAS_NUM')
 
         # Punctuation
         punc_cat = set(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"])
         if all(unicodedata.category(x) in punc_cat for x in token):
-            feature_list.append("PUNCTUATION")
+            feature_list.append('PUNCTUATION')
 
         # Suffix up to length 3
         if len(token) > 1:
-            feature_list.append("SUF_" + token[-1:])
+            feature_list.append('SUF_' + token[-1:])
         if len(token) > 2:
-            feature_list.append("SUF_" + token[-2:])
+            feature_list.append('SUF_' + token[-2:])
         if len(token) > 3:
-            feature_list.append("SUF_" + token[-3:])
+            feature_list.append('SUF_' + token[-3:])
 
-        feature_list.append("WORD_" + token)
+        feature_list.append('WORD_' + token)
 
         return feature_list
 
     def tag_sents(self, sents):
-        """
+        '''
         Tag a list of sentences. NB: before using this function, the user should specify the model_file either by
                        - Train a new model using the ``train`` function
                        - Use the pre-trained model which is set via the ``set_model_file`` function
@@ -149,10 +150,10 @@ class CRFTagger(TaggerI):
         :type sentences : list(list(str))
         :return : list of tagged sentences.
         :rtype : list (list (tuple(str,str)))
-        """
-        if self._model_file == "":
+        '''
+        if self._model_file == '':
             raise Exception(
-                " No model file is found !! Please use train or set_model_file function"
+                ' No model file is found !! Please use train or set_model_file function'
             )
 
         # We need the list of sentences instead of the list generator for matching the input and output
@@ -162,7 +163,7 @@ class CRFTagger(TaggerI):
             labels = self._tagger.tag(features)
 
             if len(labels) != len(tokens):
-                raise Exception(" Predicted Length Not Matched, Expect Errors !")
+                raise Exception(' Predicted Length Not Matched, Expect Errors !')
 
             tagged_sent = list(zip(tokens, labels))
             result.append(tagged_sent)
@@ -170,13 +171,13 @@ class CRFTagger(TaggerI):
         return result
 
     def train(self, train_data, model_file):
-        """
+        '''
         Train the CRF tagger using CRFSuite
         :param train_data : the list of annotated sentences.
         :type train_data : list (list(tuple(str,str)))
         :param model_file : the model will be saved to this file.
 
-        """
+        '''
         trainer = pycrfsuite.Trainer(verbose=self._verbose)
         trainer.set_params(self._training_options)
 
@@ -191,7 +192,7 @@ class CRFTagger(TaggerI):
         self.set_model_file(model_file)
 
     def tag(self, tokens):
-        """
+        '''
         Tag a sentence using the Python CRFSuite tagger. NB: before using this function, the user should specify the model_file either by
                        - Train a new model using the ``train`` function
                        - Use the pre-trained model which is set via the ``set_model_file`` function
@@ -199,6 +200,6 @@ class CRFTagger(TaggerI):
         :type tokens : list(str)
         :return : list of tagged tokens.
         :rtype : list (tuple(str,str))
-        """
+        '''
 
         return self.tag_sents([tokens])[0]
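
A minimal sketch of the train/tag cycle described in the docstrings above (requires the python-crfsuite package; the toy corpus and model path are illustrative):

    from nltk.tag import CRFTagger

    train_data = [[('University', 'Noun'), ('is', 'Verb'), ('a', 'Det'), ('good', 'Adj'), ('place', 'Noun')],
                  [('dog', 'Noun'), ('eat', 'Verb'), ('meat', 'Noun')]]

    ct = CRFTagger()
    ct.train(train_data, 'model.crf.tagger')  # writes the model file and sets it as the active model
    print(ct.tag_sents([['dog', 'is', 'good'], ['Cat', 'eat', 'meat']]))
    print(ct.tag(['University', 'is', 'a', 'good', 'place']))
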
index 6e543d9..5e834dc 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Hidden Markov Model
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Trevor Cohn <tacohn@csse.unimelb.edu.au>
 #         Philip Blunsom <pcbl@csse.unimelb.edu.au>
 #         Tiago Tresoldi <tiago@tresoldi.pro.br> (fixes)
@@ -68,10 +68,13 @@ of EM.
 For more information, please consult the source code for this module,
 which includes extensive demonstration code.
 """
+from __future__ import print_function, unicode_literals, division
 
 import re
 import itertools
 
+from six.moves import map, zip
+
 try:
     import numpy as np
 except ImportError:
@@ -90,6 +93,7 @@ from nltk.probability import (
 )
 from nltk.metrics import accuracy
 from nltk.util import LazyMap, unique_list
+from nltk.compat import python_2_unicode_compatible
 from nltk.tag.api import TaggerI
 
 
@@ -101,6 +105,7 @@ def _identity(labeled_symbols):
     return labeled_symbols
 
 
+@python_2_unicode_compatible
 class HiddenMarkovModelTagger(TaggerI):
     """
     Hidden Markov model class, a generative model for labelling sequence data.
@@ -180,15 +185,15 @@ class HiddenMarkovModelTagger(TaggerI):
         )
 
         if test_sequence:
-            hmm.test(test_sequence, verbose=kwargs.get("verbose", False))
+            hmm.test(test_sequence, verbose=kwargs.get('verbose', False))
 
         if unlabeled_sequence:
-            max_iterations = kwargs.get("max_iterations", 5)
+            max_iterations = kwargs.get('max_iterations', 5)
             hmm = trainer.train_unsupervised(
                 unlabeled_sequence, model=hmm, max_iterations=max_iterations
             )
             if test_sequence:
-                hmm.test(test_sequence, verbose=kwargs.get("verbose", False))
+                hmm.test(test_sequence, verbose=kwargs.get('verbose', False))
 
         return hmm
 
@@ -519,7 +524,7 @@ class HiddenMarkovModelTagger(TaggerI):
             if cum_p <= p <= cum_p + add_p:
                 return sample
             cum_p += add_p
-        raise Exception("Invalid probability distribution - " "does not sum to one")
+        raise Exception('Invalid probability distribution - ' 'does not sum to one')
 
     def entropy(self, unlabeled_sequence):
         """
@@ -564,7 +569,7 @@ class HiddenMarkovModelTagger(TaggerI):
         for i, state in enumerate(self._states):
             p = 2 ** (alpha[0, i] + beta[0, i] - normalisation)
             entropy -= p * self._priors.logprob(state)
-            # print('p(s_0 = %s) =' % state, p)
+            # print 'p(s_0 = %s) =' % state, p
 
         # state transitions
         for t0 in range(T - 1):
@@ -579,7 +584,7 @@ class HiddenMarkovModelTagger(TaggerI):
                         - normalisation
                     )
                     entropy -= p * self._transitions[s0].logprob(s1)
-                    # print('p(s_%d = %s, s_%d = %s) =' % (t0, s0, t1, s1), p)
+                    # print 'p(s_%d = %s, s_%d = %s) =' % (t0, s0, t1, s1), p
 
         # symbol emissions
         for t in range(T):
@@ -588,7 +593,7 @@ class HiddenMarkovModelTagger(TaggerI):
                 entropy -= p * self._outputs[state].logprob(
                     unlabeled_sequence[t][_TEXT]
                 )
-                # print('p(s_%d = %s) =' % (t, state), p)
+                # print 'p(s_%d = %s) =' % (t, state), p
 
         return entropy
 
@@ -640,6 +645,15 @@ class HiddenMarkovModelTagger(TaggerI):
             log_probs.append(lp)
         normalisation = _log_add(*log_probs)
 
+        # ps = zeros((T, N), float64)
+        # for labelling, lp in zip(labellings, log_probs):
+        # for t in range(T):
+        # ps[t, self._states.index(labelling[t])] += \
+        #    2**(lp - normalisation)
+
+        # for t in range(T):
+        # print 'prob[%d] =' % t, ps[t]
+
         entropy = 0
         for lp in log_probs:
             lp -= normalisation
@@ -803,33 +817,33 @@ class HiddenMarkovModelTagger(TaggerI):
         if verbose:
             for test_sent, predicted_sent in zip(test_sequence, predicted_sequence):
                 print(
-                    "Test:",
-                    " ".join("%s/%s" % (token, tag) for (token, tag) in test_sent),
+                    'Test:',
+                    ' '.join('%s/%s' % (token, tag) for (token, tag) in test_sent),
                 )
                 print()
-                print("Untagged:", " ".join("%s" % token for (token, tag) in test_sent))
+                print('Untagged:', ' '.join("%s" % token for (token, tag) in test_sent))
                 print()
                 print(
-                    "HMM-tagged:",
-                    " ".join("%s/%s" % (token, tag) for (token, tag) in predicted_sent),
+                    'HMM-tagged:',
+                    ' '.join('%s/%s' % (token, tag) for (token, tag) in predicted_sent),
                 )
                 print()
                 print(
-                    "Entropy:",
+                    'Entropy:',
                     self.entropy([(token, None) for (token, tag) in predicted_sent]),
                 )
                 print()
-                print("-" * 60)
+                print('-' * 60)
 
         test_tags = flatten(map(tags, test_sequence))
         predicted_tags = flatten(map(tags, predicted_sequence))
 
         acc = accuracy(test_tags, predicted_tags)
         count = sum(len(sent) for sent in test_sequence)
-        print("accuracy over %d tokens: %.2f" % (count, acc * 100))
+        print('accuracy over %d tokens: %.2f' % (count, acc * 100))
 
     def __repr__(self):
-        return "<HiddenMarkovModelTagger %d states and %d output symbols>" % (
+        return '<HiddenMarkovModelTagger %d states and %d output symbols>' % (
             len(self._states),
             len(self._symbols),
         )
@@ -878,7 +892,7 @@ class HiddenMarkovModelTrainer(object):
             model = self.train_supervised(labeled_sequences, **kwargs)
         if unlabeled_sequences:
             if model:
-                kwargs["model"] = model
+                kwargs['model'] = model
             model = self.train_unsupervised(unlabeled_sequences, **kwargs)
         return model
 
@@ -954,7 +968,7 @@ class HiddenMarkovModelTrainer(object):
 
         # create a uniform HMM, which will be iteratively refined, unless
         # given an existing model
-        model = kwargs.get("model")
+        model = kwargs.get('model')
         if not model:
             priors = RandomProbDist(self._states)
             transitions = DictionaryConditionalProbDist(
@@ -998,8 +1012,8 @@ class HiddenMarkovModelTrainer(object):
         converged = False
         last_logprob = None
         iteration = 0
-        max_iterations = kwargs.get("max_iterations", 1000)
-        epsilon = kwargs.get("convergence_logprob", 1e-6)
+        max_iterations = kwargs.get('max_iterations', 1000)
+        epsilon = kwargs.get('convergence_logprob', 1e-6)
 
         while not converged and iteration < max_iterations:
             A_numer = _ninf_array((N, N))
@@ -1064,7 +1078,7 @@ class HiddenMarkovModelTrainer(object):
             if iteration > 0 and abs(logprob - last_logprob) < epsilon:
                 converged = True
 
-            print("iteration", iteration, "logprob", logprob)
+            print('iteration', iteration, 'logprob', logprob)
             iteration += 1
             last_logprob = logprob
 
@@ -1179,8 +1193,8 @@ def _market_hmm_example():
     """
     Return an example HMM (described at page 381, Huang et al)
     """
-    states = ["bull", "bear", "static"]
-    symbols = ["up", "down", "unchanged"]
+    states = ['bull', 'bear', 'static']
+    symbols = ['up', 'down', 'unchanged']
     A = np.array([[0.6, 0.2, 0.2], [0.5, 0.3, 0.2], [0.4, 0.1, 0.5]], np.float64)
     B = np.array([[0.7, 0.1, 0.2], [0.1, 0.6, 0.3], [0.3, 0.3, 0.4]], np.float64)
     pi = np.array([0.5, 0.2, 0.3], np.float64)
@@ -1198,34 +1212,34 @@ def demo():
 
     model, states, symbols = _market_hmm_example()
 
-    print("Testing", model)
+    print('Testing', model)
 
     for test in [
-        ["up", "up"],
-        ["up", "down", "up"],
-        ["down"] * 5,
-        ["unchanged"] * 5 + ["up"],
+        ['up', 'up'],
+        ['up', 'down', 'up'],
+        ['down'] * 5,
+        ['unchanged'] * 5 + ['up'],
     ]:
 
         sequence = [(t, None) for t in test]
 
-        print("Testing with state sequence", test)
-        print("probability =", model.probability(sequence))
-        print("tagging =    ", model.tag([word for (word, tag) in sequence]))
-        print("p(tagged) =  ", model.probability(sequence))
-        print("H =          ", model.entropy(sequence))
-        print("H_exh =      ", model._exhaustive_entropy(sequence))
-        print("H(point) =   ", model.point_entropy(sequence))
-        print("H_exh(point)=", model._exhaustive_point_entropy(sequence))
+        print('Testing with state sequence', test)
+        print('probability =', model.probability(sequence))
+        print('tagging =    ', model.tag([word for (word, tag) in sequence]))
+        print('p(tagged) =  ', model.probability(sequence))
+        print('H =          ', model.entropy(sequence))
+        print('H_exh =      ', model._exhaustive_entropy(sequence))
+        print('H(point) =   ', model.point_entropy(sequence))
+        print('H_exh(point)=', model._exhaustive_point_entropy(sequence))
         print()
 
 
 def load_pos(num_sents):
     from nltk.corpus import brown
 
-    sentences = brown.tagged_sents(categories="news")[:num_sents]
+    sentences = brown.tagged_sents(categories='news')[:num_sents]
 
-    tag_re = re.compile(r"[*]|--|[^+*-]+")
+    tag_re = re.compile(r'[*]|--|[^+*-]+')
     tag_set = set()
     symbols = set()
 
@@ -1251,7 +1265,7 @@ def demo_pos():
     print("HMM POS tagging demo")
     print()
 
-    print("Training HMM...")
+    print('Training HMM...')
     labelled_sequences, tag_set, symbols = load_pos(20000)
     trainer = HiddenMarkovModelTrainer(tag_set, symbols)
     hmm = trainer.train_supervised(
@@ -1259,7 +1273,7 @@ def demo_pos():
         estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins),
     )
 
-    print("Testing...")
+    print('Testing...')
     hmm.test(labelled_sequences[:10], verbose=True)
 
 
@@ -1279,7 +1293,7 @@ def demo_pos_bw(
     print("Baum-Welch demo for POS tagging")
     print()
 
-    print("Training HMM (supervised, %d sentences)..." % supervised)
+    print('Training HMM (supervised, %d sentences)...' % supervised)
 
     sentences, tag_set, symbols = load_pos(test + supervised + unsupervised)
 
@@ -1296,7 +1310,7 @@ def demo_pos_bw(
 
     hmm.test(sentences[:test], verbose=verbose)
 
-    print("Training (unsupervised, %d sentences)..." % unsupervised)
+    print('Training (unsupervised, %d sentences)...' % unsupervised)
     # it's rather slow - so only use 10 samples by default
     unlabeled = _untag(sentences[test + supervised :])
     hmm = trainer.train_unsupervised(
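
A condensed sketch of the supervised path that demo_pos() above exercises (the corpus slice is arbitrary; requires the brown corpus and numpy):

    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist
    from nltk.tag.hmm import HiddenMarkovModelTrainer

    train = brown.tagged_sents(categories='news')[:500]
    trainer = HiddenMarkovModelTrainer()
    hmm = trainer.train_supervised(
        train, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins)
    )
    print(hmm.tag('The jury said it was fine .'.split()))
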
index 3053e8c..9513338 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Interface to the HunPos POS-tagger
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
 #         Dávid Márk Nemeskey <nemeskeyd@gmail.com> (modifications)
 #         Attila Zséder <zseder@gmail.com> (modifications)
@@ -15,12 +15,14 @@ A module for interfacing with the HunPos open-source POS-tagger.
 import os
 from subprocess import Popen, PIPE
 
+from six import text_type
+
 from nltk.internals import find_binary, find_file
 from nltk.tag.api import TaggerI
 
-_hunpos_url = "http://code.google.com/p/hunpos/"
+_hunpos_url = 'http://code.google.com/p/hunpos/'
 
-_hunpos_charset = "ISO-8859-1"
+_hunpos_charset = 'ISO-8859-1'
 """The default encoding used by hunpos: ISO-8859-1."""
 
 
@@ -68,27 +70,27 @@ class HunposTagger(TaggerI):
         """
         self._closed = True
         hunpos_paths = [
-            ".",
-            "/usr/bin",
-            "/usr/local/bin",
-            "/opt/local/bin",
-            "/Applications/bin",
-            "~/bin",
-            "~/Applications/bin",
+            '.',
+            '/usr/bin',
+            '/usr/local/bin',
+            '/opt/local/bin',
+            '/Applications/bin',
+            '~/bin',
+            '~/Applications/bin',
         ]
         hunpos_paths = list(map(os.path.expanduser, hunpos_paths))
 
         self._hunpos_bin = find_binary(
-            "hunpos-tag",
+            'hunpos-tag',
             path_to_bin,
-            env_vars=("HUNPOS_TAGGER",),
+            env_vars=('HUNPOS_TAGGER',),
             searchpath=hunpos_paths,
             url=_hunpos_url,
             verbose=verbose,
         )
 
         self._hunpos_model = find_file(
-            path_to_model, env_vars=("HUNPOS_TAGGER",), verbose=verbose
+            path_to_model, env_vars=('HUNPOS_TAGGER',), verbose=verbose
         )
         self._encoding = encoding
         self._hunpos = Popen(
@@ -121,7 +123,7 @@ class HunposTagger(TaggerI):
         """
         for token in tokens:
             assert "\n" not in token, "Tokens should not contain newlines"
-            if isinstance(token, str):
+            if isinstance(token, text_type):
                 token = token.encode(self._encoding)
             self._hunpos.stdin.write(token + b"\n")
         # We write a final empty line to tell hunpos that the sentence is finished:
@@ -144,6 +146,6 @@ def setup_module(module):
     from nose import SkipTest
 
     try:
-        HunposTagger("en_wsj.model")
+        HunposTagger('en_wsj.model')
     except LookupError:
         raise SkipTest("HunposTagger is not available")
index 9dedbeb..2e38365 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Tagset Mapping
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Nathan Schneider <nathan@cmu.edu>
 #         Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
@@ -29,6 +29,7 @@ X - other: foreign words, typos, abbreviations
 
 """
 
+from __future__ import print_function, unicode_literals, division
 from collections import defaultdict
 from os.path import join
 
@@ -36,44 +37,44 @@ from nltk.data import load
 
 _UNIVERSAL_DATA = "taggers/universal_tagset"
 _UNIVERSAL_TAGS = (
-    "VERB",
-    "NOUN",
-    "PRON",
-    "ADJ",
-    "ADV",
-    "ADP",
-    "CONJ",
-    "DET",
-    "NUM",
-    "PRT",
-    "X",
-    ".",
+    'VERB',
+    'NOUN',
+    'PRON',
+    'ADJ',
+    'ADV',
+    'ADP',
+    'CONJ',
+    'DET',
+    'NUM',
+    'PRT',
+    'X',
+    '.',
 )
 
 # _MAPPINGS = defaultdict(lambda: defaultdict(dict))
 # the mapping between tagsets T1 and T2 returns UNK if applied to an unrecognized tag
-_MAPPINGS = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: "UNK")))
+_MAPPINGS = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 'UNK')))
 
 
 def _load_universal_map(fileid):
-    contents = load(join(_UNIVERSAL_DATA, fileid + ".map"), format="text")
+    contents = load(join(_UNIVERSAL_DATA, fileid + '.map'), format="text")
 
     # When mapping to the Universal Tagset,
     # map unknown inputs to 'X' not 'UNK'
-    _MAPPINGS[fileid]["universal"].default_factory = lambda: "X"
+    _MAPPINGS[fileid]['universal'].default_factory = lambda: 'X'
 
     for line in contents.splitlines():
         line = line.strip()
-        if line == "":
+        if line == '':
             continue
-        fine, coarse = line.split("\t")
+        fine, coarse = line.split('\t')
 
-        assert coarse in _UNIVERSAL_TAGS, "Unexpected coarse tag: {}".format(coarse)
+        assert coarse in _UNIVERSAL_TAGS, 'Unexpected coarse tag: {}'.format(coarse)
         assert (
-            fine not in _MAPPINGS[fileid]["universal"]
-        ), "Multiple entries for original tag: {}".format(fine)
+            fine not in _MAPPINGS[fileid]['universal']
+        ), 'Multiple entries for original tag: {}'.format(fine)
 
-        _MAPPINGS[fileid]["universal"][fine] = coarse
+        _MAPPINGS[fileid]['universal'][fine] = coarse
 
 
 def tagset_mapping(source, target):
@@ -87,28 +88,28 @@ def tagset_mapping(source, target):
     """
 
     if source not in _MAPPINGS or target not in _MAPPINGS[source]:
-        if target == "universal":
+        if target == 'universal':
             _load_universal_map(source)
             # Added the new Russian National Corpus mappings because the
             # Russian model for nltk.pos_tag() uses it.
-            _MAPPINGS["ru-rnc-new"]["universal"] = {
-                "A": "ADJ",
-                "A-PRO": "PRON",
-                "ADV": "ADV",
-                "ADV-PRO": "PRON",
-                "ANUM": "ADJ",
-                "CONJ": "CONJ",
-                "INTJ": "X",
-                "NONLEX": ".",
-                "NUM": "NUM",
-                "PARENTH": "PRT",
-                "PART": "PRT",
-                "PR": "ADP",
-                "PRAEDIC": "PRT",
-                "PRAEDIC-PRO": "PRON",
-                "S": "NOUN",
-                "S-PRO": "PRON",
-                "V": "VERB",
+            _MAPPINGS['ru-rnc-new']['universal'] = {
+                'A': 'ADJ',
+                'A-PRO': 'PRON',
+                'ADV': 'ADV',
+                'ADV-PRO': 'PRON',
+                'ANUM': 'ADJ',
+                'CONJ': 'CONJ',
+                'INTJ': 'X',
+                'NONLEX': '.',
+                'NUM': 'NUM',
+                'PARENTH': 'PRT',
+                'PART': 'PRT',
+                'PR': 'ADP',
+                'PRAEDIC': 'PRT',
+                'PRAEDIC-PRO': 'PRON',
+                'S': 'NOUN',
+                'S-PRO': 'PRON',
+                'V': 'VERB',
             }
 
     return _MAPPINGS[source][target]
@@ -127,10 +128,10 @@ def map_tag(source, target, source_tag):
     """
 
     # we need a systematic approach to naming
-    if target == "universal":
-        if source == "wsj":
-            source = "en-ptb"
-        if source == "brown":
-            source = "en-brown"
+    if target == 'universal':
+        if source == 'wsj':
+            source = 'en-ptb'
+        if source == 'brown':
+            source = 'en-brown'
 
     return tagset_mapping(source, target)[source_tag]
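
A quick sketch of the mapping helpers defined in this file (requires the universal_tagset data package; expected outputs are shown as comments):

    from nltk.tag.mapping import map_tag, tagset_mapping

    print(map_tag('en-ptb', 'universal', 'NNS'))          # 'NOUN'
    print(map_tag('wsj', 'universal', 'VBZ'))             # 'VERB' ('wsj' is normalised to 'en-ptb' above)
    print(tagset_mapping('en-brown', 'universal')['JJ'])  # 'ADJ'
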
index 1742a59..548c004 100644 (file)
@@ -9,6 +9,9 @@
 #
 # This module is provided under the terms of the MIT License.
 
+from __future__ import absolute_import
+from __future__ import print_function, division
+
 import random
 from collections import defaultdict
 import pickle
@@ -16,30 +19,22 @@ import logging
 
 from nltk.tag.api import TaggerI
 from nltk.data import find, load
-
-from nltk import jsontags
-
-try:
-    import numpy as np
-except ImportError:
-    pass
+from nltk.compat import python_2_unicode_compatible
 
 PICKLE = "averaged_perceptron_tagger.pickle"
 
-@jsontags.register_tag
-class AveragedPerceptron:
 
-    """An averaged perceptron, as implemented by Matthew Honnibal.
+class AveragedPerceptron(object):
+
+    '''An averaged perceptron, as implemented by Matthew Honnibal.
 
     See more implementation details here:
         https://explosion.ai/blog/part-of-speech-pos-tagger-in-python
-    """
-
-    json_tag = "nltk.tag.perceptron.AveragedPerceptron"
+    '''
 
-    def __init__(self, weights=None):
+    def __init__(self):
         # Each feature gets its own weight vector, so weights is a dict-of-dicts
-        self.weights = weights if weights else {}
+        self.weights = {}
         self.classes = set()
         # The accumulated values, for the averaging. These will be keyed by
         # feature/class tuples
@@ -51,13 +46,8 @@ class AveragedPerceptron:
         # Number of instances seen
         self.i = 0
 
-    def _softmax(self, scores):
-        s = np.fromiter(scores.values(), dtype=float)
-        exps = np.exp(s)
-        return exps / np.sum(exps)
-
-    def predict(self, features, return_conf=False):
-        """Dot-product the features and current weights and return the best label."""
+    def predict(self, features):
+        '''Dot-product the features and current weights and return the best label.'''
         scores = defaultdict(float)
         for feat, value in features.items():
             if feat not in self.weights or value == 0:
@@ -65,16 +55,11 @@ class AveragedPerceptron:
             weights = self.weights[feat]
             for label, weight in weights.items():
                 scores[label] += value * weight
-
         # Do a secondary alphabetic sort, for stability
-        best_label = max(self.classes, key=lambda label: (scores[label], label))
-        # compute the confidence
-        conf = max(self._softmax(scores)) if return_conf == True else None
-
-        return best_label, conf
+        return max(self.classes, key=lambda label: (scores[label], label))
 
     def update(self, truth, guess, features):
-        """Update the feature weights."""
+        '''Update the feature weights.'''
 
         def upd_feat(c, f, w, v):
             param = (f, c)
@@ -91,7 +76,7 @@ class AveragedPerceptron:
             upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
 
     def average_weights(self):
-        """Average weights from all iterations."""
+        '''Average weights from all iterations.'''
         for feat, weights in self.weights.items():
             new_feat_weights = {}
             for clas, weight in weights.items():
@@ -104,26 +89,19 @@ class AveragedPerceptron:
             self.weights[feat] = new_feat_weights
 
     def save(self, path):
-        """Save the pickled model weights."""
-        with open(path, "wb") as fout:
+        '''Save the pickled model weights.'''
+        with open(path, 'wb') as fout:
             return pickle.dump(dict(self.weights), fout)
 
     def load(self, path):
-        """Load the pickled model weights."""
+        '''Load the pickled model weights.'''
         self.weights = load(path)
 
-    def encode_json_obj(self):
-        return self.weights
-
-    @classmethod
-    def decode_json_obj(cls, obj):
-        return cls(obj)
 
-
-@jsontags.register_tag
+@python_2_unicode_compatible
 class PerceptronTagger(TaggerI):
 
-    """
+    '''
     Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
     See more implementation details here:
         https://explosion.ai/blog/part-of-speech-pos-tagger-in-python
@@ -149,59 +127,54 @@ class PerceptronTagger(TaggerI):
 
     >>> pretrain.tag("The red cat".split())
     [('The', 'DT'), ('red', 'JJ'), ('cat', 'NN')]
-    """
-
-    json_tag = "nltk.tag.sequential.PerceptronTagger" 
+    '''
 
-    START = ["-START-", "-START2-"]
-    END = ["-END-", "-END2-"]
+    START = ['-START-', '-START2-']
+    END = ['-END-', '-END2-']
 
     def __init__(self, load=True):
-        """
+        '''
         :param load: Load the pickled model upon instantiation.
-        """
+        '''
         self.model = AveragedPerceptron()
         self.tagdict = {}
         self.classes = set()
         if load:
-            AP_MODEL_LOC = "file:" + str(
-                find("taggers/averaged_perceptron_tagger/" + PICKLE)
+            AP_MODEL_LOC = 'file:' + str(
+                find('taggers/averaged_perceptron_tagger/' + PICKLE)
             )
             self.load(AP_MODEL_LOC)
 
-    def tag(self, tokens, return_conf=False, use_tagdict=True):
-        """
+    def tag(self, tokens):
+        '''
         Tag tokenized sentences.
         :param tokens: list of words
         :type tokens: list(str)
-        """
+        '''
         prev, prev2 = self.START
         output = []
 
         context = self.START + [self.normalize(w) for w in tokens] + self.END
         for i, word in enumerate(tokens):
-            tag, conf = (
-                (self.tagdict.get(word), 1.0) if use_tagdict == True else (None, None)
-            )
+            tag = self.tagdict.get(word)
             if not tag:
                 features = self._get_features(i, word, context, prev, prev2)
-                tag, conf = self.model.predict(features, return_conf)
-            output.append((word, tag, conf) if return_conf == True else (word, tag))
-
+                tag = self.model.predict(features)
+            output.append((word, tag))
             prev2 = prev
             prev = tag
 
         return output
 
     def train(self, sentences, save_loc=None, nr_iter=5):
-        """Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
+        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
         controls the number of Perceptron training iterations.
 
         :param sentences: A list or iterator of sentences, where each sentence
             is a list of (word, tag) tuples.
         :param save_loc: If not ``None``, saves a pickled model in this location.
         :param nr_iter: Number of training iterations.
-        """
+        '''
         # We'd like to allow ``sentences`` to be either a list or an iterator,
         # the latter being especially important for a large training dataset.
         # Because ``self._make_tagdict(sentences)`` runs regardless, we make
@@ -224,7 +197,7 @@ class PerceptronTagger(TaggerI):
                     guess = self.tagdict.get(word)
                     if not guess:
                         feats = self._get_features(i, word, context, prev, prev2)
-                        guess, _ = self.model.predict(feats)
+                        guess = self.model.predict(feats)
                         self.model.update(tags[i], guess, feats)
                     prev2 = prev
                     prev = guess
@@ -240,81 +213,70 @@ class PerceptronTagger(TaggerI):
         self.model.average_weights()
         # Pickle as a binary file
         if save_loc is not None:
-            with open(save_loc, "wb") as fout:
+            with open(save_loc, 'wb') as fout:
                 # changed protocol from -1 to 2 to make pickling Python 2 compatible
                 pickle.dump((self.model.weights, self.tagdict, self.classes), fout, 2)
 
     def load(self, loc):
-        """
+        '''
         :param loc: Load a pickled model at location.
         :type loc: str
-        """
+        '''
 
         self.model.weights, self.tagdict, self.classes = load(loc)
         self.model.classes = self.classes
 
-    def encode_json_obj(self):
-        return self.model.weights, self.tagdict, list(self.classes)
-
-    @classmethod
-    def decode_json_obj(cls, obj):
-        tagger = cls(load=False)
-        tagger.model.weights, tagger.tagdict, tagger.classes = obj
-        tagger.classes = set(tagger.classes)
-        tagger.model.classes = tagger.classes
-        return tagger
-
     def normalize(self, word):
-        """
+        '''
         Normalization used in pre-processing.
         - All words are lower cased
         - Groups of digits of length 4 are represented as !YEAR;
         - Other digits are represented as !DIGITS
 
         :rtype: str
-        """
-        if "-" in word and word[0] != "-":
-            return "!HYPHEN"
+        '''
+        if '-' in word and word[0] != '-':
+            return '!HYPHEN'
         elif word.isdigit() and len(word) == 4:
-            return "!YEAR"
+            return '!YEAR'
         elif word[0].isdigit():
-            return "!DIGITS"
+            return '!DIGITS'
         else:
             return word.lower()
 
     def _get_features(self, i, word, context, prev, prev2):
-        """Map tokens into a feature representation, implemented as a
+        '''Map tokens into a feature representation, implemented as a
         {hashable: int} dict. If the features change, a new model must be
         trained.
-        """
+        '''
 
         def add(name, *args):
-            features[" ".join((name,) + tuple(args))] += 1
+            features[' '.join((name,) + tuple(args))] += 1
 
         i += len(self.START)
         features = defaultdict(int)
         # It's useful to have a constant feature, which acts sort of like a prior
-        add("bias")
-        add("i suffix", word[-3:])
-        add("i pref1", word[0])
-        add("i-1 tag", prev)
-        add("i-2 tag", prev2)
-        add("i tag+i-2 tag", prev, prev2)
-        add("i word", context[i])
-        add("i-1 tag+i word", prev, context[i])
-        add("i-1 word", context[i - 1])
-        add("i-1 suffix", context[i - 1][-3:])
-        add("i-2 word", context[i - 2])
-        add("i+1 word", context[i + 1])
-        add("i+1 suffix", context[i + 1][-3:])
-        add("i+2 word", context[i + 2])
+        add('bias')
+        add('i suffix', word[-3:])
+        add('i pref1', word[0])
+        add('i-1 tag', prev)
+        add('i-2 tag', prev2)
+        add('i tag+i-2 tag', prev, prev2)
+        add('i word', context[i])
+        add('i-1 tag+i word', prev, context[i])
+        add('i-1 word', context[i - 1])
+        add('i-1 suffix', context[i - 1][-3:])
+        add('i-2 word', context[i - 2])
+        add('i+1 word', context[i + 1])
+        add('i+1 suffix', context[i + 1][-3:])
+        add('i+2 word', context[i + 2])
         return features
 
     def _make_tagdict(self, sentences):
-        """
+        '''
         Make a tag dictionary for single-tag words.
         :param sentences: A list of lists of (word, tag) tuples.
-        """
+        '''
         counts = defaultdict(lambda: defaultdict(int))
         for sentence in sentences:
             self._sentences.append(sentence)
@@ -337,8 +299,8 @@ def _pc(n, d):
 
 
 def _load_data_conll_format(filename):
-    print("Read from file: ", filename)
-    with open(filename, "rb") as fin:
+    print('Read from file: ', filename)
+    with open(filename, 'rb') as fin:
         sentences = []
         sentence = []
         for line in fin.readlines():
@@ -348,7 +310,7 @@ def _load_data_conll_format(filename):
                 sentences.append(sentence)
                 sentence = []
                 continue
-            tokens = line.split("\t")
+            tokens = line.split('\t')
             word = tokens[1]
             tag = tokens[4]
             sentence.append((word, tag))
@@ -360,14 +322,14 @@ def _get_pretrain_model():
     # Train: section 2-11
     # Test : section 23
     tagger = PerceptronTagger()
-    training = _load_data_conll_format("english_ptb_train.conll")
-    testing = _load_data_conll_format("english_ptb_test.conll")
-    print("Size of training and testing (sentence)", len(training), len(testing))
+    training = _load_data_conll_format('english_ptb_train.conll')
+    testing = _load_data_conll_format('english_ptb_test.conll')
+    print('Size of training and testing (sentence)', len(training), len(testing))
     # Train and save the model
     tagger.train(training, PICKLE)
-    print("Accuracy : ", tagger.evaluate(testing))
+    print('Accuracy : ', tagger.evaluate(testing))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     # _get_pretrain_model()
     pass
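
A short sketch mirroring the class docstring of the tagger in this file (the pretrained model needs the averaged_perceptron_tagger data package; the toy training corpus is illustrative):

    from nltk.tag.perceptron import PerceptronTagger

    # Pretrained model shipped in nltk_data.
    pretrained = PerceptronTagger()
    print(pretrained.tag('The quick brown fox jumps over the lazy dog'.split()))

    # Training from scratch on a tiny corpus of (word, tag) sentences.
    tagger = PerceptronTagger(load=False)
    tagger.train([[('today', 'NN'), ('is', 'VBZ'), ('good', 'JJ'), ('day', 'NN')],
                  [('yes', 'NNS'), ('it', 'PRP'), ('beautiful', 'JJ')]])
    print(tagger.tag(['today', 'is', 'a', 'beautiful', 'day']))
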
index 5231d25..8404656 100644 (file)
@@ -1,7 +1,7 @@
 # encoding: utf-8
 # Natural Language Toolkit: Senna POS Tagger
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -39,13 +39,14 @@ Note: Unit tests for this module can be found in test/unit/test_senna.py
     ('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')]
 """
 
+from nltk.compat import python_2_unicode_compatible
 from nltk.classify import Senna
 
 
-
+@python_2_unicode_compatible
 class SennaTagger(Senna):
-    def __init__(self, path, encoding="utf-8"):
-        super(SennaTagger, self).__init__(path, ["pos"], encoding)
+    def __init__(self, path, encoding='utf-8'):
+        super(SennaTagger, self).__init__(path, ['pos'], encoding)
 
     def tag_sents(self, sentences):
         """
@@ -56,14 +57,14 @@ class SennaTagger(Senna):
         for i in range(len(tagged_sents)):
             for j in range(len(tagged_sents[i])):
                 annotations = tagged_sents[i][j]
-                tagged_sents[i][j] = (annotations["word"], annotations["pos"])
+                tagged_sents[i][j] = (annotations['word'], annotations['pos'])
         return tagged_sents
 
 
-
+@python_2_unicode_compatible
 class SennaChunkTagger(Senna):
-    def __init__(self, path, encoding="utf-8"):
-        super(SennaChunkTagger, self).__init__(path, ["chk"], encoding)
+    def __init__(self, path, encoding='utf-8'):
+        super(SennaChunkTagger, self).__init__(path, ['chk'], encoding)
 
     def tag_sents(self, sentences):
         """
@@ -74,7 +75,7 @@ class SennaChunkTagger(Senna):
         for i in range(len(tagged_sents)):
             for j in range(len(tagged_sents[i])):
                 annotations = tagged_sents[i][j]
-                tagged_sents[i][j] = (annotations["word"], annotations["chk"])
+                tagged_sents[i][j] = (annotations['word'], annotations['chk'])
         return tagged_sents
 
     def bio_to_chunks(self, tagged_sent, chunk_type):
@@ -105,24 +106,24 @@ class SennaChunkTagger(Senna):
         current_chunk_position = []
         for idx, word_pos in enumerate(tagged_sent):
             word, pos = word_pos
-            if "-" + chunk_type in pos:  # Append the word to the current_chunk.
+            if '-' + chunk_type in pos:  # Append the word to the current_chunk.
                 current_chunk.append((word))
                 current_chunk_position.append((idx))
             else:
                 if current_chunk:  # Flush the full chunk when out of an NP.
-                    _chunk_str = " ".join(current_chunk)
-                    _chunk_pos_str = "-".join(map(str, current_chunk_position))
+                    _chunk_str = ' '.join(current_chunk)
+                    _chunk_pos_str = '-'.join(map(str, current_chunk_position))
                     yield _chunk_str, _chunk_pos_str
                     current_chunk = []
                     current_chunk_position = []
         if current_chunk:  # Flush the last chunk.
-            yield " ".join(current_chunk), "-".join(map(str, current_chunk_position))
-
+            yield ' '.join(current_chunk), '-'.join(map(str, current_chunk_position))
 
 
+@python_2_unicode_compatible
 class SennaNERTagger(Senna):
-    def __init__(self, path, encoding="utf-8"):
-        super(SennaNERTagger, self).__init__(path, ["ner"], encoding)
+    def __init__(self, path, encoding='utf-8'):
+        super(SennaNERTagger, self).__init__(path, ['ner'], encoding)
 
     def tag_sents(self, sentences):
         """
@@ -133,7 +134,7 @@ class SennaNERTagger(Senna):
         for i in range(len(tagged_sents)):
             for j in range(len(tagged_sents[i])):
                 annotations = tagged_sents[i][j]
-                tagged_sents[i][j] = (annotations["word"], annotations["ner"])
+                tagged_sents[i][j] = (annotations['word'], annotations['ner'])
         return tagged_sents
 
 
@@ -142,6 +143,6 @@ def setup_module(module):
     from nose import SkipTest
 
     try:
-        tagger = Senna("/usr/share/senna-v3.0", ["pos", "chk", "ner"])
+        tagger = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
     except OSError:
         raise SkipTest("Senna executable not found")
index e49d3ad..3d3a767 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Sequential Backoff Taggers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com> (minor additions)
 #         Tiago Tresoldi <tresoldi@users.sf.net> (original affix tagger)
@@ -17,13 +17,14 @@ determine a tag for the specified token, then its backoff tagger is
 consulted instead.  Any SequentialBackoffTagger may serve as a
 backoff tagger for any other SequentialBackoffTagger.
 """
-import ast
+from __future__ import print_function, unicode_literals
 from abc import abstractmethod
 
 import re
 
 from nltk.probability import ConditionalFreqDist
 from nltk.classify import NaiveBayesClassifier
+from nltk.compat import python_2_unicode_compatible
 
 from nltk.tag.api import TaggerI, FeaturesetTaggerI
 
@@ -105,6 +106,7 @@ class SequentialBackoffTagger(TaggerI):
         """
 
 
+@python_2_unicode_compatible
 class ContextTagger(SequentialBackoffTagger):
     """
     An abstract base class for sequential backoff taggers that choose
@@ -125,7 +127,7 @@ class ContextTagger(SequentialBackoffTagger):
         :param context_to_tag: A dictionary mapping contexts to tags.
         :param backoff: The backoff tagger that should be used for this tagger.
         """
-        super().__init__(backoff)
+        SequentialBackoffTagger.__init__(self, backoff)
         self._context_to_tag = context_to_tag if context_to_tag else {}
 
     @abstractmethod
@@ -149,7 +151,7 @@ class ContextTagger(SequentialBackoffTagger):
         return len(self._context_to_tag)
 
     def __repr__(self):
-        return "<{}: size={}>".format(self.__class__.__name__, self.size())
+        return '<%s: size=%d>' % (self.__class__.__name__, self.size())
 
     def _train(self, tagged_corpus, cutoff=0, verbose=False):
         """
@@ -207,15 +209,14 @@ class ContextTagger(SequentialBackoffTagger):
             size = len(self._context_to_tag)
             backoff = 100 - (hit_count * 100.0) / token_count
             pruning = 100 - (size * 100.0) / len(fd.conditions())
-            print("[Trained Unigram tagger:", end=" ")
-            print("size={}, backoff={:.2f}%, pruning={:.2f}%]".format(size, backoff, pruning))
+            print("[Trained Unigram tagger:", end=' ')
+            print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning))
 
 
 ######################################################################
 # Tagger Classes
 ######################################################################
-
-
+@python_2_unicode_compatible
 @jsontags.register_tag
 class DefaultTagger(SequentialBackoffTagger):
     """
@@ -234,11 +235,11 @@ class DefaultTagger(SequentialBackoffTagger):
     :type tag: str
     """
 
-    json_tag = "nltk.tag.sequential.DefaultTagger"
+    json_tag = 'nltk.tag.sequential.DefaultTagger'
 
     def __init__(self, tag):
         self._tag = tag
-        super().__init__(None)
+        SequentialBackoffTagger.__init__(self, None)
 
     def encode_json_obj(self):
         return self._tag
@@ -252,7 +253,7 @@ class DefaultTagger(SequentialBackoffTagger):
         return self._tag  # ignore token and history
 
     def __repr__(self):
-        return "<DefaultTagger: tag={}>".format(self._tag)
+        return '<DefaultTagger: tag=%s>' % self._tag
 
 
 @jsontags.register_tag
@@ -280,7 +281,7 @@ class NgramTagger(ContextTagger):
         context-to-tag table for the new tagger.
     """
 
-    json_tag = "nltk.tag.sequential.NgramTagger"
+    json_tag = 'nltk.tag.sequential.NgramTagger'
 
     def __init__(
         self, n, train=None, model=None, backoff=None, cutoff=0, verbose=False
@@ -288,34 +289,18 @@ class NgramTagger(ContextTagger):
         self._n = n
         self._check_params(train, model)
 
-        super().__init__(model, backoff)
+        ContextTagger.__init__(self, model, backoff)
 
         if train:
             self._train(train, cutoff, verbose)
 
     def encode_json_obj(self):
-        _context_to_tag = {repr(k): v for k, v in self._context_to_tag.items()}
-        if "NgramTagger" in self.__class__.__name__:
-            return self._n, _context_to_tag, self.backoff
-        else:
-            return _context_to_tag, self.backoff
+        return self._n, self._context_to_tag, self.backoff
 
     @classmethod
     def decode_json_obj(cls, obj):
-        try:
-            _n, _context_to_tag, backoff = obj
-        except ValueError:
-            _context_to_tag, backoff = obj
-
-        if not _context_to_tag:
-            return backoff
-
-        _context_to_tag = {ast.literal_eval(k): v for k, v in _context_to_tag.items()}
-
-        if "NgramTagger" in cls.__name__:
-            return cls(_n, model=_context_to_tag, backoff=backoff)
-        else:
-            return cls(model=_context_to_tag, backoff=backoff)
+        _n, _context_to_tag, backoff = obj
+        return cls(_n, model=_context_to_tag, backoff=backoff)
 
     def context(self, tokens, index, history):
         tag_context = tuple(history[max(0, index - self._n + 1) : index])
@@ -335,7 +320,7 @@ class UnigramTagger(NgramTagger):
         >>> test_sent = brown.sents(categories='news')[0]
         >>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
         >>> for tok, tag in unigram_tagger.tag(test_sent):
-        ...     print("({}, {}), ".format(tok, tag))
+        ...     print("(%s, %s), " % (tok, tag))
         (The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL),
         (Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT),
         (investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ),
@@ -355,10 +340,18 @@ class UnigramTagger(NgramTagger):
     :type cutoff: int
     """
 
-    json_tag = "nltk.tag.sequential.UnigramTagger"
+    json_tag = 'nltk.tag.sequential.UnigramTagger'
 
     def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
-        super().__init__(1, train, model, backoff, cutoff, verbose)
+        NgramTagger.__init__(self, 1, train, model, backoff, cutoff, verbose)
+
+    def encode_json_obj(self):
+        return self._context_to_tag, self.backoff
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        _context_to_tag, backoff = obj
+        return cls(model=_context_to_tag, backoff=backoff)
 
     def context(self, tokens, index, history):
         return tokens[index]
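
(A minimal, self-contained sketch of the constructor path shown above, using an invented toy corpus rather than Brown: a UnigramTagger trained with a DefaultTagger backoff, so unseen words fall through to the default tag.)

    from nltk.tag import DefaultTagger, UnigramTagger

    toy_corpus = [
        [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')],
        [('the', 'DT'), ('dog', 'NN'), ('barked', 'VBD')],
    ]
    tagger = UnigramTagger(train=toy_corpus, backoff=DefaultTagger('NN'))
    print(tagger.tag(['the', 'cat', 'meowed']))
    # [('the', 'DT'), ('cat', 'NN'), ('meowed', 'NN')] -- 'meowed' is unseen, so the backoff answers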
@@ -384,10 +377,18 @@ class BigramTagger(NgramTagger):
     :type cutoff: int
     """
 
-    json_tag = "nltk.tag.sequential.BigramTagger"
+    json_tag = 'nltk.tag.sequential.BigramTagger'
 
     def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
-        super().__init__(2, train, model, backoff, cutoff, verbose)
+        NgramTagger.__init__(self, 2, train, model, backoff, cutoff, verbose)
+
+    def encode_json_obj(self):
+        return self._context_to_tag, self.backoff
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        _context_to_tag, backoff = obj
+        return cls(model=_context_to_tag, backoff=backoff)
 
 
 @jsontags.register_tag
@@ -410,10 +411,18 @@ class TrigramTagger(NgramTagger):
     :type cutoff: int
     """
 
-    json_tag = "nltk.tag.sequential.TrigramTagger"
+    json_tag = 'nltk.tag.sequential.TrigramTagger'
 
     def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
-        super().__init__(3, train, model, backoff, cutoff, verbose)
+        NgramTagger.__init__(self, 3, train, model, backoff, cutoff, verbose)
+
+    def encode_json_obj(self):
+        return self._context_to_tag, self.backoff
+
+    @classmethod
+    def decode_json_obj(cls, obj):
+        _context_to_tag, backoff = obj
+        return cls(model=_context_to_tag, backoff=backoff)
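
(The per-class encode_json_obj/decode_json_obj pair restored above can be exercised directly; a sketch under the assumption that the 3.4-style payload is simply a (context-to-tag dict, backoff tagger) pair, bypassing the json/jsontags machinery that normally mediates the round trip.)

    from nltk.tag import DefaultTagger, UnigramTagger

    original = UnigramTagger(model={'the': 'DT', 'cat': 'NN'}, backoff=DefaultTagger('NN'))
    payload = original.encode_json_obj()            # (context-to-tag dict, backoff tagger)
    clone = UnigramTagger.decode_json_obj(payload)  # rebuilds an equivalent tagger
    print(clone.tag(['the', 'cat', 'purred']))
    # [('the', 'DT'), ('cat', 'NN'), ('purred', 'NN')]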
 
 
 @jsontags.register_tag
@@ -436,7 +445,7 @@ class AffixTagger(ContextTagger):
         tag of None by this tagger.
     """
 
-    json_tag = "nltk.tag.sequential.AffixTagger"
+    json_tag = 'nltk.tag.sequential.AffixTagger'
 
     def __init__(
         self,
@@ -451,7 +460,7 @@ class AffixTagger(ContextTagger):
 
         self._check_params(train, model)
 
-        super().__init__(model, backoff)
+        ContextTagger.__init__(self, model, backoff)
 
         self._affix_length = affix_length
         self._min_word_length = min_stem_length + abs(affix_length)
@@ -487,6 +496,7 @@ class AffixTagger(ContextTagger):
             return token[self._affix_length :]
 
 
+@python_2_unicode_compatible
 @jsontags.register_tag
 class RegexpTagger(SequentialBackoffTagger):
     """
@@ -530,36 +540,36 @@ class RegexpTagger(SequentialBackoffTagger):
         assigned the tag None.
     """
 
-    json_tag = "nltk.tag.sequential.RegexpTagger"
+    json_tag = 'nltk.tag.sequential.RegexpTagger'
 
     def __init__(self, regexps, backoff=None):
         """
         """
-        super().__init__(backoff)
-        try:
-            self._regexps = [(re.compile(regexp), tag,) for regexp, tag in regexps]
-        except Exception as e:
-            raise Exception(
-                'Invalid RegexpTagger regexp:', str(e), 'regexp:', regexp, 'tag:', tag)
+        SequentialBackoffTagger.__init__(self, backoff)
+        self._regexs = [(re.compile(regexp), tag) for regexp, tag in regexps]
 
     def encode_json_obj(self):
-        return [(regexp.pattern, tag) for regexp, tag in self._regexps], self.backoff
+        return [(regexp.pattern, tag) for regexp, tag in self._regexs], self.backoff
 
     @classmethod
     def decode_json_obj(cls, obj):
         regexps, backoff = obj
-        return cls(regexps, backoff)
+        self = cls(())
+        self._regexs = [(re.compile(regexp), tag) for regexp, tag in regexps]
+        SequentialBackoffTagger.__init__(self, backoff)
+        return self
 
     def choose_tag(self, tokens, index, history):
-        for regexp, tag in self._regexps:
+        for regexp, tag in self._regexs:
             if re.match(regexp, tokens[index]):
                 return tag
         return None
 
     def __repr__(self):
-        return "<Regexp Tagger: size={}>".format(len(self._regexps))
+        return '<Regexp Tagger: size=%d>' % len(self._regexs)
 
 
+@python_2_unicode_compatible
 class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
     """
     A sequential tagger that uses a classifier to choose the tag for
@@ -615,11 +625,11 @@ class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
     ):
         self._check_params(train, classifier)
 
-        super().__init__(backoff)
+        SequentialBackoffTagger.__init__(self, backoff)
 
         if (train and classifier) or (not train and not classifier):
             raise ValueError(
-                "Must specify either training data or " "trained classifier."
+                'Must specify either training data or ' 'trained classifier.'
             )
 
         if feature_detector is not None:
@@ -659,7 +669,7 @@ class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
 
         classifier_corpus = []
         if verbose:
-            print("Constructing training corpus for classifier.")
+            print('Constructing training corpus for classifier.')
 
         for sentence in tagged_corpus:
             history = []
@@ -670,11 +680,11 @@ class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
                 history.append(tags[index])
 
         if verbose:
-            print("Training classifier ({} instances)".format(len(classifier_corpus)))
+            print('Training classifier (%d instances)' % len(classifier_corpus))
         self._classifier = classifier_builder(classifier_corpus)
 
     def __repr__(self):
-        return "<ClassifierBasedTagger: {}>".format(self._classifier)
+        return '<ClassifierBasedTagger: %r>' % self._classifier
 
     def feature_detector(self, tokens, index, history):
         """
@@ -719,32 +729,32 @@ class ClassifierBasedPOSTagger(ClassifierBasedTagger):
             prevtag = history[index - 1]
             prevprevtag = history[index - 2]
 
-        if re.match("[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word):
-            shape = "number"
-        elif re.match("\W+$", word):
-            shape = "punct"
-        elif re.match("[A-Z][a-z]+$", word):
-            shape = "upcase"
-        elif re.match("[a-z]+$", word):
-            shape = "downcase"
-        elif re.match("\w+$", word):
-            shape = "mixedcase"
+        if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word):
+            shape = 'number'
+        elif re.match('\W+$', word):
+            shape = 'punct'
+        elif re.match('[A-Z][a-z]+$', word):
+            shape = 'upcase'
+        elif re.match('[a-z]+$', word):
+            shape = 'downcase'
+        elif re.match('\w+$', word):
+            shape = 'mixedcase'
         else:
-            shape = "other"
+            shape = 'other'
 
         features = {
-            "prevtag": prevtag,
-            "prevprevtag": prevprevtag,
-            "word": word,
-            "word.lower": word.lower(),
-            "suffix3": word.lower()[-3:],
-            "suffix2": word.lower()[-2:],
-            "suffix1": word.lower()[-1:],
-            "prevprevword": prevprevword,
-            "prevword": prevword,
-            "prevtag+word": "{}+{}".format(prevtag, word.lower()),
-            "prevprevtag+word": "{}+{}".format(prevprevtag, word.lower()),
-            "prevword+word": "{}+{}".format(prevword, word.lower()),
-            "shape": shape,
+            'prevtag': prevtag,
+            'prevprevtag': prevprevtag,
+            'word': word,
+            'word.lower': word.lower(),
+            'suffix3': word.lower()[-3:],
+            'suffix2': word.lower()[-2:],
+            'suffix1': word.lower()[-1:],
+            'prevprevword': prevprevword,
+            'prevword': prevword,
+            'prevtag+word': '%s+%s' % (prevtag, word.lower()),
+            'prevprevtag+word': '%s+%s' % (prevprevtag, word.lower()),
+            'prevword+word': '%s+%s' % (prevword, word.lower()),
+            'shape': shape,
         }
         return features
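
(To make the shape buckets above concrete, a standalone illustration that mirrors the regex cascade in feature_detector; the helper name word_shape is invented for this sketch.)

    import re

    def word_shape(word):
        # Same cascade as feature_detector above, pulled out for illustration.
        if re.match(r'[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word):
            return 'number'
        elif re.match(r'\W+$', word):
            return 'punct'
        elif re.match(r'[A-Z][a-z]+$', word):
            return 'upcase'
        elif re.match(r'[a-z]+$', word):
            return 'downcase'
        elif re.match(r'\w+$', word):
            return 'mixedcase'
        return 'other'

    for w in ['1913', '...', 'Atlanta', 'jury', 'NP-TL']:
        print(w, word_shape(w))
    # 1913 number / ... punct / Atlanta upcase / jury downcase / NP-TL other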
index cd7250c..9916386 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Nitin Madnani <nmadnani@ets.org>
 #         Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
 # URL: <http://nltk.org/>
@@ -23,10 +23,12 @@ import tempfile
 from subprocess import PIPE
 import warnings
 
+from six import text_type
+
 from nltk.internals import find_file, find_jar, config_java, java, _java_options
 from nltk.tag.api import TaggerI
 
-_stanford_url = "https://nlp.stanford.edu/software"
+_stanford_url = 'https://nlp.stanford.edu/software'
 
 
 class StanfordTagger(TaggerI):
@@ -40,16 +42,16 @@ class StanfordTagger(TaggerI):
     - ``_JAR`` file: Class constant that represents the jar file name.
     """
 
-    _SEPARATOR = ""
-    _JAR = ""
+    _SEPARATOR = ''
+    _JAR = ''
 
     def __init__(
         self,
         model_filename,
         path_to_jar=None,
-        encoding="utf8",
+        encoding='utf8',
         verbose=False,
-        java_options="-mx1000m",
+        java_options='-mx1000m',
     ):
         # Raise deprecation warning.
         warnings.warn(
@@ -64,16 +66,16 @@ class StanfordTagger(TaggerI):
 
         if not self._JAR:
             warnings.warn(
-                "The StanfordTagger class is not meant to be "
-                "instantiated directly. Did you mean "
-                "StanfordPOSTagger or StanfordNERTagger?"
+                'The StanfordTagger class is not meant to be '
+                'instantiated directly. Did you mean '
+                'StanfordPOSTagger or StanfordNERTagger?'
             )
         self._stanford_jar = find_jar(
             self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose
         )
 
         self._stanford_model = find_file(
-            model_filename, env_vars=("STANFORD_MODELS",), verbose=verbose
+            model_filename, env_vars=('STANFORD_MODELS',), verbose=verbose
         )
 
         self._encoding = encoding
@@ -92,19 +94,19 @@ class StanfordTagger(TaggerI):
 
     def tag_sents(self, sentences):
         encoding = self._encoding
-        default_options = " ".join(_java_options)
+        default_options = ' '.join(_java_options)
         config_java(options=self.java_options, verbose=False)
 
         # Create a temporary input file
         _input_fh, self._input_file_path = tempfile.mkstemp(text=True)
 
         cmd = list(self._cmd)
-        cmd.extend(["-encoding", encoding])
+        cmd.extend(['-encoding', encoding])
 
         # Write the actual sentences to the temporary input file
-        _input_fh = os.fdopen(_input_fh, "wb")
-        _input = "\n".join((" ".join(x) for x in sentences))
-        if isinstance(_input, str) and encoding:
+        _input_fh = os.fdopen(_input_fh, 'wb')
+        _input = '\n'.join((' '.join(x) for x in sentences))
+        if isinstance(_input, text_type) and encoding:
             _input = _input.encode(encoding)
         _input_fh.write(_input)
         _input_fh.close()
@@ -130,7 +132,7 @@ class StanfordTagger(TaggerI):
             sentence = []
             for tagged_word in tagged_sentence.strip().split():
                 word_tags = tagged_word.strip().split(self._SEPARATOR)
-                sentence.append(("".join(word_tags[:-1]), word_tags[-1]))
+                sentence.append((''.join(word_tags[:-1]), word_tags[-1]))
             tagged_sentences.append(sentence)
         return tagged_sentences
 
@@ -151,8 +153,8 @@ class StanfordPOSTagger(StanfordTagger):
         [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
     """
 
-    _SEPARATOR = "_"
-    _JAR = "stanford-postagger.jar"
+    _SEPARATOR = '_'
+    _JAR = 'stanford-postagger.jar'
 
     def __init__(self, *args, **kwargs):
         super(StanfordPOSTagger, self).__init__(*args, **kwargs)
@@ -160,15 +162,15 @@ class StanfordPOSTagger(StanfordTagger):
     @property
     def _cmd(self):
         return [
-            "edu.stanford.nlp.tagger.maxent.MaxentTagger",
-            "-model",
+            'edu.stanford.nlp.tagger.maxent.MaxentTagger',
+            '-model',
             self._stanford_model,
-            "-textFile",
+            '-textFile',
             self._input_file_path,
-            "-tokenize",
-            "false",
-            "-outputFormatOptions",
-            "keepEmptySentences",
+            '-tokenize',
+            'false',
+            '-outputFormatOptions',
+            'keepEmptySentences',
         ]
 
 
@@ -191,9 +193,9 @@ class StanfordNERTagger(StanfordTagger):
          ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'LOCATION')]
     """
 
-    _SEPARATOR = "/"
-    _JAR = "stanford-ner.jar"
-    _FORMAT = "slashTags"
+    _SEPARATOR = '/'
+    _JAR = 'stanford-ner.jar'
+    _FORMAT = 'slashTags'
 
     def __init__(self, *args, **kwargs):
         super(StanfordNERTagger, self).__init__(*args, **kwargs)
@@ -202,27 +204,27 @@ class StanfordNERTagger(StanfordTagger):
     def _cmd(self):
         # Add -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false so that the Stanford tokenizer is not used
         return [
-            "edu.stanford.nlp.ie.crf.CRFClassifier",
-            "-loadClassifier",
+            'edu.stanford.nlp.ie.crf.CRFClassifier',
+            '-loadClassifier',
             self._stanford_model,
-            "-textFile",
+            '-textFile',
             self._input_file_path,
-            "-outputFormat",
+            '-outputFormat',
             self._FORMAT,
-            "-tokenizerFactory",
-            "edu.stanford.nlp.process.WhitespaceTokenizer",
-            "-tokenizerOptions",
-            '"tokenizeNLs=false"',
+            '-tokenizerFactory',
+            'edu.stanford.nlp.process.WhitespaceTokenizer',
+            '-tokenizerOptions',
+            '\"tokenizeNLs=false\"',
         ]
 
     def parse_output(self, text, sentences):
-        if self._FORMAT == "slashTags":
+        if self._FORMAT == 'slashTags':
             # Join together into one big list
             tagged_sentences = []
             for tagged_sentence in text.strip().split("\n"):
                 for tagged_word in tagged_sentence.strip().split():
                     word_tags = tagged_word.strip().split(self._SEPARATOR)
-                    tagged_sentences.append(("".join(word_tags[:-1]), word_tags[-1]))
+                    tagged_sentences.append((''.join(word_tags[:-1]), word_tags[-1]))
 
             # Separate it according to the input
             result = []
@@ -239,9 +241,9 @@ def setup_module(module):
     from nose import SkipTest
 
     try:
-        StanfordPOSTagger("english-bidirectional-distsim.tagger")
+        StanfordPOSTagger('english-bidirectional-distsim.tagger')
     except LookupError:
         raise SkipTest(
-            "Doctests from nltk.tag.stanford are skipped because one \
-                       of the stanford jars cannot be found."
+            'Doctests from nltk.tag.stanford are skipped because one \
+                       of the stanford jars cannot be found.'
         )
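
(For completeness, a hedged usage sketch of this deprecated wrapper; the jar and model paths below are placeholders for wherever the Stanford distribution was unpacked, and a Java runtime must be on the PATH.)

    from nltk.tag import StanfordPOSTagger

    # Placeholder paths -- point these at your own Stanford POS tagger download.
    jar = '/opt/stanford-postagger/stanford-postagger.jar'
    model = '/opt/stanford-postagger/models/english-bidirectional-distsim.tagger'

    st = StanfordPOSTagger(model, path_to_jar=jar, encoding='utf8')
    print(st.tag('What is the airspeed of an unladen swallow ?'.split()))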
index eb2ce12..4837e11 100644 (file)
@@ -1,18 +1,18 @@
 # Natural Language Toolkit: TnT Tagger
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Sam Huston <sjh900@gmail.com>
 #
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
-"""
+'''
 Implementation of 'TnT - A Statistical Part of Speech Tagger'
 by Thorsten Brants
 
 http://acl.ldc.upenn.edu/A/A00/A00-1031.pdf
-"""
-
+'''
+from __future__ import print_function, division
 from math import log
 
 from operator import itemgetter
@@ -22,7 +22,7 @@ from nltk.tag.api import TaggerI
 
 
 class TnT(TaggerI):
-    """
+    '''
     TnT - Statistical POS tagger
 
     IMPORTANT NOTES:
@@ -81,10 +81,10 @@ class TnT(TaggerI):
     It is possible to differentiate the tags which are assigned to
     capitalized words. However this does not result in a significant
     gain in the accuracy of the results.
-    """
+    '''
 
     def __init__(self, unk=None, Trained=False, N=1000, C=False):
-        """
+        '''
         Construct a TnT statistical tagger. Tagger must be trained
         before being used to tag input.
 
@@ -111,7 +111,7 @@ class TnT(TaggerI):
         information for tagging.
         NOTE: using capitalization may not increase the accuracy
         of the tagger
-        """
+        '''
 
         self._uni = FreqDist()
         self._bi = ConditionalFreqDist()
@@ -132,14 +132,14 @@ class TnT(TaggerI):
         self.known = 0
 
     def train(self, data):
-        """
+        '''
         Uses a set of tagged data to train the tagger.
         If an unknown word tagger is specified,
         it is trained on the same data.
 
         :param data: List of lists of (word, tag) tuples
         :type data: tuple(str)
-        """
+        '''
 
         # Ensure that local C flag is initialized before use
         C = False
@@ -148,7 +148,7 @@ class TnT(TaggerI):
             self._unk.train(data)
 
         for sent in data:
-            history = [("BOS", False), ("BOS", False)]
+            history = [('BOS', False), ('BOS', False)]
             for w, t in sent:
 
                 # if capitalization is requested,
@@ -168,13 +168,17 @@ class TnT(TaggerI):
                 # set local flag C to false for the next word
                 C = False
 
-            self._eos[t]["EOS"] += 1
+            self._eos[t]['EOS'] += 1
 
         # compute lambda values from the trained frequency distributions
         self._compute_lambda()
 
+        # (debugging -- ignore or delete me)
+        # print "lambdas"
+        # print i, self._l1, i, self._l2, i, self._l3
+
     def _compute_lambda(self):
-        """
+        '''
         creates lambda values based upon training data
 
         NOTE: no need to explicitly reference C,
@@ -191,7 +195,7 @@ class TnT(TaggerI):
         ISSUES -- Resolutions:
         if 2 values are equal, increment both lambda values
         by (f(t1,t2,t3) / 2)
-        """
+        '''
 
         # temporary lambda variables
         tl1 = 0.0
@@ -246,6 +250,7 @@ class TnT(TaggerI):
                 # otherwise there might be a problem
                 # eg: all values = 0
                 else:
+                    # print "Problem", c1, c2 ,c3
                     pass
 
         # Lambda normalisation:
@@ -255,17 +260,17 @@ class TnT(TaggerI):
         self._l3 = tl3 / (tl1 + tl2 + tl3)
 
     def _safe_div(self, v1, v2):
-        """
+        '''
         Safe floating point division function, does not allow division by 0
         returns -1 if the denominator is 0
-        """
+        '''
         if v2 == 0:
             return -1
         else:
             return v1 / v2
 
     def tagdata(self, data):
-        """
+        '''
         Tags each sentence in a list of sentences
 
         :param data:list of list of words
@@ -275,7 +280,7 @@ class TnT(TaggerI):
         Invokes tag(sent) function for each sentence
         compiles the results into a list of tagged sentences
         each tagged sentence is a list of (word, tag) tuples
-        """
+        '''
         res = []
         for sent in data:
             res1 = self.tag(sent)
@@ -283,7 +288,7 @@ class TnT(TaggerI):
         return res
 
     def tag(self, data):
-        """
+        '''
         Tags a single sentence
 
         :param data: list of words
@@ -298,9 +303,9 @@ class TnT(TaggerI):
         with the correct words in the input sequence
 
         returns a list of (word, tag) tuples
-        """
+        '''
 
-        current_state = [(["BOS", "BOS"], 0.0)]
+        current_state = [(['BOS', 'BOS'], 0.0)]
 
         sent = list(data)
 
@@ -315,7 +320,7 @@ class TnT(TaggerI):
         return res
 
     def _tagword(self, sent, current_states):
-        """
+        '''
         :param sent : List of words remaining in the sentence
         :type sent  : [word,]
         :param current_states : List of possible tag combinations for
@@ -328,7 +333,7 @@ class TnT(TaggerI):
 
         Uses formula specified above to calculate the probability
         of a particular tag
-        """
+        '''
 
         # if this word marks the end of the sentence,
         # return the most probable tag
@@ -381,7 +386,7 @@ class TnT(TaggerI):
             # if no unknown word tagger has been specified
             # then use the tag 'Unk'
             if self._unk is None:
-                tag = ("Unk", C)
+                tag = ('Unk', C)
 
             # otherwise apply the unknown word tagger
             else:
@@ -415,7 +420,7 @@ class TnT(TaggerI):
 
 
 def basic_sent_chop(data, raw=True):
-    """
+    '''
     Basic method for tokenizing input into sentences
     for this tagger:
 
@@ -437,11 +442,11 @@ def basic_sent_chop(data, raw=True):
 
     This is a simple method which enhances the performance of the TnT
     tagger. Better sentence tokenization will further enhance the results.
-    """
+    '''
 
     new_data = []
     curr_sent = []
-    sent_mark = [",", ".", "?", "!"]
+    sent_mark = [',', '.', '?', '!']
 
     if raw:
         for word in data:
@@ -469,16 +474,19 @@ def demo():
     sents = list(brown.tagged_sents())
     test = list(brown.sents())
 
+    # create and train the tagger
     tagger = TnT()
     tagger.train(sents[200:1000])
 
+    # tag some data
     tagged_data = tagger.tagdata(test[100:120])
 
+    # print results
     for j in range(len(tagged_data)):
         s = tagged_data[j]
         t = sents[j + 100]
         for i in range(len(s)):
-            print(s[i], "--", t[i])
+            print(s[i], '--', t[i])
         print()
 
 
@@ -499,11 +507,11 @@ def demo2():
         t.unknown = 0
         t.known = 0
 
-        print("Capitalization off:")
-        print("Accuracy:", tacc)
-        print("Percentage known:", tp_kn)
-        print("Percentage unknown:", tp_un)
-        print("Accuracy over known words:", (tacc / tp_kn))
+        print('Capitalization off:')
+        print('Accuracy:', tacc)
+        print('Percentage known:', tp_kn)
+        print('Percentage unknown:', tp_un)
+        print('Accuracy over known words:', (tacc / tp_kn))
 
         sacc = s.evaluate(d[i * 100 : ((i + 1) * 100)])
         sp_un = s.unknown / (s.known + s.unknown)
@@ -511,11 +519,11 @@ def demo2():
         s.unknown = 0
         s.known = 0
 
-        print("Capitalization on:")
-        print("Accuracy:", sacc)
-        print("Percentage known:", sp_kn)
-        print("Percentage unknown:", sp_un)
-        print("Accuracy over known words:", (sacc / sp_kn))
+        print('Capitalization on:')
+        print('Accuracy:', sacc)
+        print('Percentage known:', sp_kn)
+        print('Percentage unknown:', sp_un)
+        print('Accuracy over known words:', (sacc / sp_kn))
 
 
 def demo3():
@@ -570,7 +578,7 @@ def demo3():
         tallacc += tacc
         sallacc += sacc
 
-        # print(i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc)
+        # print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc
 
     print("brown: acc over words known:", 10 * tknacc)
     print("     : overall accuracy:", 10 * tallacc)
index 9d2172e..2a397d0 100644 (file)
@@ -1,13 +1,13 @@
 # Natural Language Toolkit: Tagger Utilities
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 
-def str2tuple(s, sep="/"):
+def str2tuple(s, sep='/'):
     """
     Given the string representation of a tagged token, return the
     corresponding tuple representation.  The rightmost occurrence of
@@ -31,7 +31,7 @@ def str2tuple(s, sep="/"):
         return (s, None)
 
 
-def tuple2str(tagged_token, sep="/"):
+def tuple2str(tagged_token, sep='/'):
     """
     Given the tuple representation of a tagged token, return the
     corresponding string representation.  This representation is
@@ -54,8 +54,8 @@ def tuple2str(tagged_token, sep="/"):
     if tag is None:
         return word
     else:
-        assert sep not in tag, "tag may not contain sep!"
-        return "%s%s%s" % (word, sep, tag)
+        assert sep not in tag, 'tag may not contain sep!'
+        return '%s%s%s' % (word, sep, tag)
 
 
 def untag(tagged_sentence):
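
(A tiny round-trip sketch of the two helpers above.)

    from nltk.tag.util import str2tuple, tuple2str

    assert str2tuple('fly/NN') == ('fly', 'NN')
    assert tuple2str(('fly', 'NN')) == 'fly/NN'
    assert str2tuple('fly|NN', sep='|') == ('fly', 'NN')   # a custom separator works the same way
    assert str2tuple('fly') == ('fly', None)               # no separator present -> tag is None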
index dca2b46..5298a5a 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Transformation-based learning
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Marcus Uneson <marcus.uneson@gmail.com>
 #   based on previous (nltk2) version by
 #   Christopher Maloof, Edward Loper, Steven Bird
index f6656e4..80f7d82 100644 (file)
Binary files a/nlp_resource_data/nltk/tbl/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/tbl/__pycache__/__init__.cpython-37.pyc differ
index 06de1f0..9d6ef73 100644 (file)
Binary files a/nlp_resource_data/nltk/tbl/__pycache__/api.cpython-37.pyc and b/nlp_resource_data/nltk/tbl/__pycache__/api.cpython-37.pyc differ
index fe124f3..ba75504 100644 (file)
Binary files a/nlp_resource_data/nltk/tbl/__pycache__/demo.cpython-37.pyc and b/nlp_resource_data/nltk/tbl/__pycache__/demo.cpython-37.pyc differ
index 5db4cae..0d7de8d 100644 (file)
Binary files a/nlp_resource_data/nltk/tbl/__pycache__/erroranalysis.cpython-37.pyc and b/nlp_resource_data/nltk/tbl/__pycache__/erroranalysis.cpython-37.pyc differ
index 7b886bc..908884c 100644 (file)
Binary files a/nlp_resource_data/nltk/tbl/__pycache__/feature.cpython-37.pyc and b/nlp_resource_data/nltk/tbl/__pycache__/feature.cpython-37.pyc differ
index de847c9..841cb57 100644 (file)
Binary files a/nlp_resource_data/nltk/tbl/__pycache__/rule.cpython-37.pyc and b/nlp_resource_data/nltk/tbl/__pycache__/rule.cpython-37.pyc differ
index 6393690..4a2ce0c 100644 (file)
Binary files a/nlp_resource_data/nltk/tbl/__pycache__/template.cpython-37.pyc and b/nlp_resource_data/nltk/tbl/__pycache__/template.cpython-37.pyc differ
index da30446..28642ae 100644 (file)
@@ -1,13 +1,14 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Transformation-based learning
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Marcus Uneson <marcus.uneson@gmail.com>
 #   based on previous (nltk2) version by
 #   Christopher Maloof, Edward Loper, Steven Bird
 # URL: <http://nltk.org/>
 # For license information, see  LICENSE.TXT
 
+from __future__ import print_function, absolute_import, division
 import os
 import pickle
 
@@ -248,7 +249,7 @@ def postag(
             baseline_tagger = UnigramTagger(
                 baseline_data, backoff=baseline_backoff_tagger
             )
-            with open(cache_baseline_tagger, "w") as print_rules:
+            with open(cache_baseline_tagger, 'w') as print_rules:
                 pickle.dump(baseline_tagger, print_rules)
             print(
                 "Trained baseline tagger, pickled it to {0}".format(
@@ -316,17 +317,17 @@ def postag(
 
     # writing error analysis to file
     if error_output is not None:
-        with open(error_output, "w") as f:
-            f.write("Errors for Brill Tagger %r\n\n" % serialize_output)
+        with open(error_output, 'w') as f:
+            f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
             f.write(
-                u"\n".join(error_list(gold_data, taggedtest)).encode("utf-8") + "\n"
+                u'\n'.join(error_list(gold_data, taggedtest)).encode('utf-8') + '\n'
             )
         print("Wrote tagger errors including context to {0}".format(error_output))
 
     # serializing the tagger to a pickle file and reloading (just to see it works)
     if serialize_output is not None:
         taggedtest = brill_tagger.tag_sents(testing_data)
-        with open(serialize_output, "w") as print_rules:
+        with open(serialize_output, 'w') as print_rules:
             pickle.dump(brill_tagger, print_rules)
         print("Wrote pickled tagger to {0}".format(serialize_output))
         with open(serialize_output, "r") as print_rules:
@@ -380,15 +381,15 @@ def _demo_prepare_data(
 
 
 def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None):
-    testcurve = [teststats["initialerrors"]]
-    for rulescore in teststats["rulescores"]:
+    testcurve = [teststats['initialerrors']]
+    for rulescore in teststats['rulescores']:
         testcurve.append(testcurve[-1] - rulescore)
-    testcurve = [1 - x / teststats["tokencount"] for x in testcurve[:take]]
+    testcurve = [1 - x / teststats['tokencount'] for x in testcurve[:take]]
 
-    traincurve = [trainstats["initialerrors"]]
-    for rulescore in trainstats["rulescores"]:
+    traincurve = [trainstats['initialerrors']]
+    for rulescore in trainstats['rulescores']:
         traincurve.append(traincurve[-1] - rulescore)
-    traincurve = [1 - x / trainstats["tokencount"] for x in traincurve[:take]]
+    traincurve = [1 - x / trainstats['tokencount'] for x in traincurve[:take]]
 
     import matplotlib.pyplot as plt
 
@@ -398,19 +399,19 @@ def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None):
     plt.savefig(learning_curve_output)
 
 
-NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(.[0-9]+)?$", "CD"), (r".*", "NN")])
+NN_CD_TAGGER = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')])
 
 REGEXP_TAGGER = RegexpTagger(
     [
-        (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # cardinal numbers
-        (r"(The|the|A|a|An|an)$", "AT"),  # articles
-        (r".*able$", "JJ"),  # adjectives
-        (r".*ness$", "NN"),  # nouns formed from adjectives
-        (r".*ly$", "RB"),  # adverbs
-        (r".*s$", "NNS"),  # plural nouns
-        (r".*ing$", "VBG"),  # gerunds
-        (r".*ed$", "VBD"),  # past tense verbs
-        (r".*", "NN"),  # nouns (default)
+        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
+        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
+        (r'.*able$', 'JJ'),  # adjectives
+        (r'.*ness$', 'NN'),  # nouns formed from adjectives
+        (r'.*ly$', 'RB'),  # adverbs
+        (r'.*s$', 'NNS'),  # plural nouns
+        (r'.*ing$', 'VBG'),  # gerunds
+        (r'.*ed$', 'VBD'),  # past tense verbs
+        (r'.*', 'NN'),  # nouns (default)
     ]
 )
 
@@ -419,5 +420,5 @@ def corpus_size(seqs):
     return (len(seqs), sum(len(x) for x in seqs))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo_learning_curve()
index 9c0881a..c25d33d 100644 (file)
@@ -1,13 +1,16 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Transformation-based learning
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Marcus Uneson <marcus.uneson@gmail.com>
 #   based on previous (nltk2) version by
 #   Christopher Maloof, Edward Loper, Steven Bird
 # URL: <http://nltk.org/>
 # For license information, see  LICENSE.TXT
 
+from __future__ import print_function
+
+
 # returns a list of errors in string format
 
 
@@ -21,21 +24,21 @@ def error_list(train_sents, test_sents):
     :param test_sents: The tagged corpus
     :type test_sents: list(tuple)
     """
-    hdr = ("%25s | %s | %s\n" + "-" * 26 + "+" + "-" * 24 + "+" + "-" * 26) % (
-        "left context",
-        "word/test->gold".center(22),
-        "right context",
+    hdr = ('%25s | %s | %s\n' + '-' * 26 + '+' + '-' * 24 + '+' + '-' * 26) % (
+        'left context',
+        'word/test->gold'.center(22),
+        'right context',
     )
     errors = [hdr]
     for (train_sent, test_sent) in zip(train_sents, test_sents):
         for wordnum, (word, train_pos) in enumerate(train_sent):
             test_pos = test_sent[wordnum][1]
             if train_pos != test_pos:
-                left = " ".join("%s/%s" % w for w in train_sent[:wordnum])
-                right = " ".join("%s/%s" % w for w in train_sent[wordnum + 1 :])
-                mid = "%s/%s->%s" % (word, test_pos, train_pos)
+                left = ' '.join('%s/%s' % w for w in train_sent[:wordnum])
+                right = ' '.join('%s/%s' % w for w in train_sent[wordnum + 1 :])
+                mid = '%s/%s->%s' % (word, test_pos, train_pos)
                 errors.append(
-                    "%25s | %s | %s" % (left[-25:], mid.center(22), right[:25])
+                    '%25s | %s | %s' % (left[-25:], mid.center(22), right[:25])
                 )
 
     return errors
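
(A sketch of how error_list is typically consumed -- gold vs. predicted tags on an invented sentence pair; the import path assumes this file is nltk/tbl/erroranalysis.py.)

    from nltk.tbl.erroranalysis import error_list

    gold      = [[('the', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('boats', 'VBZ')]]
    predicted = [[('the', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('boats', 'NNS')]]
    for row in error_list(gold, predicted):
        print(row)
    # prints the header plus one row containing: boats/NNS->VBZ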
index 9a5bb00..d9c6715 100644 (file)
@@ -1,17 +1,20 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Transformation-based learning
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Marcus Uneson <marcus.uneson@gmail.com>
 #   based on previous (nltk2) version by
 #   Christopher Maloof, Edward Loper, Steven Bird
 # URL: <http://nltk.org/>
 # For license information, see  LICENSE.TXT
 
+from __future__ import division, print_function, unicode_literals
 from abc import ABCMeta, abstractmethod
+from six import add_metaclass
 
 
-class Feature(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class Feature(object):
     """
     An abstract base class for Features. A Feature is a combination of
     a specific property-computing method and a list of relative positions
@@ -30,7 +33,7 @@ class Feature(metaclass=ABCMeta):
 
     """
 
-    json_tag = "nltk.tbl.Feature"
+    json_tag = 'nltk.tbl.Feature'
     PROPERTY_NAME = None
 
     def __init__(self, positions, end=None):
index 3c872f8..6d70954 100644 (file)
@@ -1,22 +1,26 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Transformation-based learning
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Marcus Uneson <marcus.uneson@gmail.com>
 #   based on previous (nltk2) version by
 #   Christopher Maloof, Edward Loper, Steven Bird
 # URL: <http://nltk.org/>
 # For license information, see  LICENSE.TXT
 
+from __future__ import print_function
 from abc import ABCMeta, abstractmethod
+from six import add_metaclass
 
+from nltk.compat import python_2_unicode_compatible, unicode_repr
 from nltk import jsontags
 
 
 ######################################################################
 # Tag Rules
 ######################################################################
-class TagRule(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class TagRule(object):
     """
     An interface for tag transformations on a tagged corpus, as
     performed by tbl taggers.  Each transformation finds all tokens
@@ -92,6 +96,7 @@ class TagRule(metaclass=ABCMeta):
         raise TypeError("Rules must implement __hash__()")
 
 
+@python_2_unicode_compatible
 @jsontags.register_tag
 class Rule(TagRule):
     """
@@ -112,7 +117,7 @@ class Rule(TagRule):
 
     """
 
-    json_tag = "nltk.tbl.Rule"
+    json_tag = 'nltk.tbl.Rule'
 
     def __init__(self, templateid, original_tag, replacement_tag, conditions):
         """
@@ -137,19 +142,16 @@ class Rule(TagRule):
 
     def encode_json_obj(self):
         return {
-            "templateid": self.templateid,
-            "original": self.original_tag,
-            "replacement": self.replacement_tag,
-            "conditions": self._conditions,
+            'templateid': self.templateid,
+            'original': self.original_tag,
+            'replacement': self.replacement_tag,
+            'conditions': self._conditions,
         }
 
     @classmethod
     def decode_json_obj(cls, obj):
         return cls(
-            obj["templateid"],
-            obj["original"],
-            obj["replacement"],
-            tuple(tuple(feat) for feat in obj["conditions"])
+            obj['templateid'], obj['original'], obj['replacement'], obj['conditions']
         )
 
     def applies(self, tokens, index):
@@ -205,12 +207,12 @@ class Rule(TagRule):
             self.__repr = "{0}('{1}', {2}, {3}, [{4}])".format(
                 self.__class__.__name__,
                 self.templateid,
-                repr(self.original_tag),
-                repr(self.replacement_tag),
+                unicode_repr(self.original_tag),
+                unicode_repr(self.replacement_tag),
                 # list(self._conditions) would be simpler but will not generate
                 # the same Rule.__repr__ in python 2 and 3 and thus break some tests
-                ", ".join(
-                    "({0},{1})".format(f, repr(v))
+                ', '.join(
+                    "({0},{1})".format(f, unicode_repr(v))
                     for (f, v) in self._conditions
                 ),
             )
@@ -223,16 +225,16 @@ class Rule(TagRule):
             Return a compact, predicate-logic styled string representation
             of the given condition.
             """
-            return "{0}:{1}@[{2}]".format(
+            return '{0}:{1}@[{2}]'.format(
                 feature.PROPERTY_NAME,
                 value,
                 ",".join(str(w) for w in feature.positions),
             )
 
-        conditions = " & ".join(
+        conditions = ' & '.join(
             [_condition_to_logic(f, v) for (f, v) in self._conditions]
         )
-        s = "{0}->{1} if {2}".format(
+        s = '{0}->{1} if {2}'.format(
             self.original_tag, self.replacement_tag, conditions
         )
 
@@ -301,26 +303,26 @@ class Rule(TagRule):
             if len(positions) == 1:
                 p = positions[0]
                 if p == 0:
-                    return "this word"
+                    return 'this word'
                 if p == -1:
-                    return "the preceding word"
+                    return 'the preceding word'
                 elif p == 1:
-                    return "the following word"
+                    return 'the following word'
                 elif p < 0:
-                    return "word i-%d" % -p
+                    return 'word i-%d' % -p
                 elif p > 0:
-                    return "word i+%d" % p
+                    return 'word i+%d' % p
             else:
                 # for complete compatibility with the wordy format of nltk2
                 mx = max(positions)
                 mn = min(positions)
                 if mx - mn == len(positions) - 1:
-                    return "words i%+d...i%+d" % (mn, mx)
+                    return 'words i%+d...i%+d' % (mn, mx)
                 else:
-                    return "words {%s}" % (",".join("i%+d" % d for d in positions),)
+                    return 'words {%s}' % (",".join("i%+d" % d for d in positions),)
 
-        replacement = "%s -> %s" % (self.original_tag, self.replacement_tag)
-        conditions = (" if " if self._conditions else "") + ", and ".join(
+        replacement = '%s -> %s' % (self.original_tag, self.replacement_tag)
+        conditions = (' if ' if self._conditions else "") + ', and '.join(
             condition_to_str(f, v) for (f, v) in self._conditions
         )
         return replacement + conditions
index 06ddff0..b0556ed 100644 (file)
@@ -1,20 +1,23 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Transformation-based learning
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Marcus Uneson <marcus.uneson@gmail.com>
 #   based on previous (nltk2) version by
 #   Christopher Maloof, Edward Loper, Steven Bird
 # URL: <http://nltk.org/>
 # For license information, see  LICENSE.TXT
 
+from __future__ import print_function
 from abc import ABCMeta, abstractmethod
+from six import add_metaclass
 import itertools as it
 from nltk.tbl.feature import Feature
 from nltk.tbl.rule import Rule
 
 
-class BrillTemplateI(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class BrillTemplateI(object):
     """
     An interface for generating lists of transformational rules that
     apply at given sentence positions.  ``BrillTemplateI`` is used by
index 639b0b1..107774e 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Unit Tests
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
index aa37dc5..319c440 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/__init__.cpython-37.pyc differ
index f4967ba..ee3607d 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/all.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/all.cpython-37.pyc differ
index 129e949..7c11dc1 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/childes_fixt.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/childes_fixt.cpython-37.pyc differ
index 04e8a4e..d35f0e4 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/classify_fixt.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/classify_fixt.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/test/__pycache__/compat_fixt.cpython-37.pyc b/nlp_resource_data/nltk/test/__pycache__/compat_fixt.cpython-37.pyc
new file mode 100644 (file)
index 0000000..0dfa362
Binary files /dev/null and b/nlp_resource_data/nltk/test/__pycache__/compat_fixt.cpython-37.pyc differ
index 7bed297..4da5b4e 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/corpus_fixt.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/corpus_fixt.cpython-37.pyc differ
index ee3454e..f6b3b4f 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/discourse_fixt.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/discourse_fixt.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/test/__pycache__/doctest_nose_plugin.cpython-37.pyc b/nlp_resource_data/nltk/test/__pycache__/doctest_nose_plugin.cpython-37.pyc
new file mode 100644 (file)
index 0000000..4e9a518
Binary files /dev/null and b/nlp_resource_data/nltk/test/__pycache__/doctest_nose_plugin.cpython-37.pyc differ
index 1955f4d..08ccf6c 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/gensim_fixt.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/gensim_fixt.cpython-37.pyc differ
index 97e9ae8..ff389c2 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/gluesemantics_malt_fixt.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/gluesemantics_malt_fixt.cpython-37.pyc differ
index 55fe85f..3850b9a 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/inference_fixt.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/inference_fixt.cpython-37.pyc differ
index cbbc4e0..4171a7e 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/nonmonotonic_fixt.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/nonmonotonic_fixt.cpython-37.pyc differ
index f063665..e888c66 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/portuguese_en_fixt.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/portuguese_en_fixt.cpython-37.pyc differ
index e7dacd8..b8f9560 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/probability_fixt.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/probability_fixt.cpython-37.pyc differ
index 9cc299b..37d94b2 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/runtests.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/runtests.cpython-37.pyc differ
index 41dbea1..d703bab 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/segmentation_fixt.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/segmentation_fixt.cpython-37.pyc differ
index 98f8e27..9d1d2e1 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/semantics_fixt.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/semantics_fixt.cpython-37.pyc differ
index b870953..c1e47e5 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/translate_fixt.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/translate_fixt.cpython-37.pyc differ
index b4224b9..3362534 100644 (file)
Binary files a/nlp_resource_data/nltk/test/__pycache__/wordnet_fixt.cpython-37.pyc and b/nlp_resource_data/nltk/test/__pycache__/wordnet_fixt.cpython-37.pyc differ
index 5844a39..c48e52a 100644 (file)
@@ -12,10 +12,10 @@ import os.path
 
 
 def additional_tests():
-    # print("here-000000000000000")
-    # print("-----", glob(os.path.join(os.path.dirname(__file__), '*.doctest')))
+    # print "here-000000000000000"
+    # print "-----", glob(os.path.join(os.path.dirname(__file__), '*.doctest'))
     dir = os.path.dirname(__file__)
-    paths = glob(os.path.join(dir, "*.doctest"))
+    paths = glob(os.path.join(dir, '*.doctest'))
     files = [os.path.basename(path) for path in paths]
     return unittest.TestSuite([doctest.DocFileSuite(file) for file in files])
 
index 4b27cde..e16f8a1 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
     >>> import os.path
index acc29d5..cc0ad49 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ==============================
@@ -282,7 +282,7 @@ Unicode words are supported.
 
 Lexicons for the tests:
 
-    >>> lex = lexicon.parseLexicon('''
+    >>> lex = lexicon.parseLexicon(u'''
     ...        :- S, N, NP, PP
     ...
     ...        AdjI :: N\\N
index 450e78e..ce62733 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ==============================================
index 50d0c42..9efe693 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 =======
index 312449b..04701fb 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 
 
 def setup_module(module):
@@ -6,7 +7,7 @@ def setup_module(module):
     import nltk.data
 
     try:
-        nltk.data.find("corpora/childes/data-xml/Eng-USA-MOR/")
+        nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
     except LookupError as e:
         print(e)
         raise SkipTest(
index ff4f157..6fd2ad7 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ==========
index 26d14e6..d208084 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 =============
@@ -49,7 +49,6 @@ haven't done this yet for all tests).
     ...     (dict(a=0,b=1,c=0), 'x'),
     ...     (dict(a=0,b=0,c=0), 'x'),
     ...     (dict(a=0,b=1,c=1), 'y'),
-    ...     (dict(a=None,b=1,c=0), 'x'),
     ...     ]
     >>> test = [
     ...     (dict(a=1,b=0,c=1)), # unseen
@@ -67,24 +66,24 @@ Test the Naive Bayes classifier:
     ['y', 'x', 'y', 'x']
     >>> for pdist in classifier.prob_classify_many(test):
     ...     print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y')))
-    0.2500 0.7500
-    0.5833 0.4167
-    0.3571 0.6429
-    0.7000 0.3000
+    0.3203 0.6797
+    0.5857 0.4143
+    0.3792 0.6208
+    0.6470 0.3530
     >>> classifier.show_most_informative_features()
     Most Informative Features
-                           c = 0                   x : y      =      2.3 : 1.0
-                           c = 1                   y : x      =      1.8 : 1.0
-                           a = 1                   y : x      =      1.7 : 1.0
-                           a = 0                   x : y      =      1.0 : 1.0
-                           b = 0                   x : y      =      1.0 : 1.0
-                           b = 1                   x : y      =      1.0 : 1.0
+                           c = 0                   x : y      =      2.0 : 1.0
+                           c = 1                   y : x      =      1.5 : 1.0
+                           a = 1                   y : x      =      1.4 : 1.0
+                           b = 0                   x : y      =      1.2 : 1.0
+                           a = 0                   x : y      =      1.2 : 1.0
+                           b = 1                   y : x      =      1.1 : 1.0
 
-Test the Decision Tree classifier (without None):
+Test the Decision Tree classifier:
 
     >>> classifier = nltk.classify.DecisionTreeClassifier.train(
-    ...     train[:-1], entropy_cutoff=0,
-    ...     support_cutoff=0)
+    ...     train, entropy_cutoff=0,
+    ...                                                support_cutoff=0)
     >>> sorted(classifier.labels())
     ['x', 'y']
     >>> print(classifier)
@@ -100,23 +99,6 @@ Test the Decision Tree classifier (without None):
     Traceback (most recent call last):
       . . .
     NotImplementedError
-    
-    
-Test the Decision Tree classifier (with None):
-
-    >>> classifier = nltk.classify.DecisionTreeClassifier.train(
-    ...     train, entropy_cutoff=0,
-    ...     support_cutoff=0)
-    >>> sorted(classifier.labels())
-    ['x', 'y']
-    >>> print(classifier)
-    c=0? .................................................. x
-      a=0? ................................................ x
-      a=1? ................................................ y
-      a=None? ............................................. x
-    c=1? .................................................. y
-    <BLANKLINE>
-
 
 Test SklearnClassifier, which requires the scikit-learn package.
 
index b9d1496..dce0704 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 
 
 # most of classify.doctest requires numpy
index 241913c..6a67511 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ===========
index b1bb33a..b4af859 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ==============
@@ -16,14 +16,13 @@ measured using Pointwise Mutual Information.
     >>> from nltk.collocations import *
     >>> bigram_measures = nltk.collocations.BigramAssocMeasures()
     >>> trigram_measures = nltk.collocations.TrigramAssocMeasures()
-    >>> fourgram_measures = nltk.collocations.QuadgramAssocMeasures()
     >>> finder = BigramCollocationFinder.from_words(
     ...     nltk.corpus.genesis.words('english-web.txt'))
     >>> finder.nbest(bigram_measures.pmi, 10)  # doctest: +NORMALIZE_WHITESPACE
-    [('Allon', 'Bacuth'), ('Ashteroth', 'Karnaim'), ('Ben', 'Ammi'),
-     ('En', 'Mishpat'), ('Jegar', 'Sahadutha'), ('Salt', 'Sea'),
-     ('Whoever', 'sheds'), ('appoint', 'overseers'), ('aromatic', 'resin'),
-     ('cutting', 'instrument')]
+    [(u'Allon', u'Bacuth'), (u'Ashteroth', u'Karnaim'), (u'Ben', u'Ammi'),
+     (u'En', u'Mishpat'), (u'Jegar', u'Sahadutha'), (u'Salt', u'Sea'),
+     (u'Whoever', u'sheds'), (u'appoint', u'overseers'), (u'aromatic', u'resin'),
+     (u'cutting', u'instrument')]
 
 While these words are highly collocated, the expressions are also very
 infrequent.  Therefore it is useful to apply filters, such as ignoring all
@@ -31,10 +30,10 @@ bigrams which occur less than three times in the corpus:
 
     >>> finder.apply_freq_filter(3)
     >>> finder.nbest(bigram_measures.pmi, 10)  # doctest: +NORMALIZE_WHITESPACE
-    [('Beer', 'Lahai'), ('Lahai', 'Roi'), ('gray', 'hairs'),
-     ('Most', 'High'), ('ewe', 'lambs'), ('many', 'colors'),
-     ('burnt', 'offering'), ('Paddan', 'Aram'), ('east', 'wind'),
-     ('living', 'creature')]
+    [(u'Beer', u'Lahai'), (u'Lahai', u'Roi'), (u'gray', u'hairs'),
+     (u'Most', u'High'), (u'ewe', u'lambs'), (u'many', u'colors'),
+     (u'burnt', u'offering'), (u'Paddan', u'Aram'), (u'east', u'wind'),
+     (u'living', u'creature')]
 
 We may similarly find collocations among tagged words:
 
@@ -64,10 +63,10 @@ Or spanning intervening words:
     >>> ignored_words = nltk.corpus.stopwords.words('english')
     >>> finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
     >>> finder.nbest(bigram_measures.likelihood_ratio, 10) # doctest: +NORMALIZE_WHITESPACE
-    [('chief', 'chief'), ('became', 'father'), ('years', 'became'),
-     ('hundred', 'years'), ('lived', 'became'), ('king', 'king'),
-     ('lived', 'years'), ('became', 'became'), ('chief', 'chiefs'),
-     ('hundred', 'became')]
+    [(u'chief', u'chief'), (u'became', u'father'), (u'years', u'became'),
+     (u'hundred', u'years'), (u'lived', u'became'), (u'king', u'king'),
+     (u'lived', u'years'), (u'became', u'became'), (u'chief', u'chiefs'),
+     (u'hundred', u'became')]
 
 Finders
 ~~~~~~~
@@ -127,12 +126,6 @@ A closer look at the finder's ngram frequencies:
      ((',', 'do', 'not'), 1), (('I', 'am', '!'), 1), (('Sam', 'I', '!'), 1),
      (('Sam', 'I', 'am'), 1)]
 
-A similar interface is provided for fourgrams:
-
-    >>> finder_4grams = QuadgramCollocationFinder.from_words(tokens)
-    >>> scored_4grams = finder_4grams.score_ngrams(fourgram_measures.raw_freq)
-    >>> set(fourgram for fourgram, score in scored_4grams) == set(nltk.ngrams(tokens, n=4))
-    True
 
 Filtering candidates
 ~~~~~~~~~~~~~~~~~~~~
diff --git a/nlp_resource_data/nltk/test/compat.doctest b/nlp_resource_data/nltk/test/compat.doctest
new file mode 100644 (file)
index 0000000..1d668f3
--- /dev/null
@@ -0,0 +1,134 @@
+
+=========================================
+NLTK Python 2.x - 3.x Compatibility Layer
+=========================================
+
+NLTK comes with a Python 2.x/3.x compatibility layer, nltk.compat
+(which is loosely based on `six <http://packages.python.org/six/>`_)::
+
+    >>> from nltk import compat
+    >>> compat.PY3
+    False
+    >>> # and so on
+
+@python_2_unicode_compatible
+----------------------------
+
+Under Python 2.x ``__str__`` and ``__repr__`` methods must
+return bytestrings.
+
+``@python_2_unicode_compatible`` decorator allows writing these methods
+in a way compatible with Python 3.x:
+
+1) wrap a class with this decorator,
+2) define ``__str__`` and ``__repr__`` methods returning unicode text
+   (that's what they must return under Python 3.x),
+
+and they would be fixed under Python 2.x to return byte strings::
+
+    >>> from nltk.compat import python_2_unicode_compatible
+
+    >>> @python_2_unicode_compatible
+    ... class Foo(object):
+    ...     def __str__(self):
+    ...         return u'__str__ is called'
+    ...     def __repr__(self):
+    ...         return u'__repr__ is called'
+
+    >>> foo = Foo()
+    >>> foo.__str__().__class__
+    <type 'str'>
+    >>> foo.__repr__().__class__
+    <type 'str'>
+    >>> print(foo)
+    __str__ is called
+    >>> foo
+    __repr__ is called
+
+Original versions of ``__str__`` and ``__repr__`` are available as
+``__unicode__`` and ``unicode_repr``::
+
+    >>> foo.__unicode__().__class__
+    <type 'unicode'>
+    >>> foo.unicode_repr().__class__
+    <type 'unicode'>
+    >>> unicode(foo)
+    u'__str__ is called'
+    >>> foo.unicode_repr()
+    u'__repr__ is called'
+
+There is no need to wrap a subclass with ``@python_2_unicode_compatible``
+if it doesn't override ``__str__`` and ``__repr__``::
+
+    >>> class Bar(Foo):
+    ...     pass
+    >>> bar = Bar()
+    >>> bar.__str__().__class__
+    <type 'str'>
+
+However, if a subclass overrides ``__str__`` or ``__repr__``,
+wrap it again::
+
+    >>> class BadBaz(Foo):
+    ...     def __str__(self):
+    ...         return u'Baz.__str__'
+    >>> baz = BadBaz()
+    >>> baz.__str__().__class__  # this is incorrect!
+    <type 'unicode'>
+
+    >>> @python_2_unicode_compatible
+    ... class GoodBaz(Foo):
+    ...     def __str__(self):
+    ...         return u'Baz.__str__'
+    >>> baz = GoodBaz()
+    >>> baz.__str__().__class__
+    <type 'str'>
+    >>> baz.__unicode__().__class__
+    <type 'unicode'>
+
+Applying ``@python_2_unicode_compatible`` to a subclass
+shouldn't break methods that were not overridden::
+
+    >>> baz.__repr__().__class__
+    <type 'str'>
+    >>> baz.unicode_repr().__class__
+    <type 'unicode'>
+
+unicode_repr
+------------
+
+Under Python 3.x ``repr(unicode_string)`` doesn't have a leading "u" letter.
+
+The ``nltk.compat.unicode_repr`` function may be used instead of ``repr`` and
+``"%r" % obj`` to make the output more consistent under Python 2.x and 3.x::
+
+    >>> from nltk.compat import unicode_repr
+    >>> print(repr(u"test"))
+    u'test'
+    >>> print(unicode_repr(u"test"))
+    'test'
+
+It may also be used to get the original unescaped repr (as unicode)
+of objects whose class was fixed by the ``@python_2_unicode_compatible``
+decorator::
+
+    >>> @python_2_unicode_compatible
+    ... class Foo(object):
+    ...     def __repr__(self):
+    ...         return u'<Foo: foo>'
+
+    >>> foo = Foo()
+    >>> repr(foo)
+    '<Foo: foo>'
+    >>> unicode_repr(foo)
+    u'<Foo: foo>'
+
+For other objects it returns the same value as ``repr``::
+
+    >>> unicode_repr(5)
+    '5'
+
+It may be a good idea to use ``unicode_repr`` instead of ``%r``
+string formatting specifier inside ``__repr__`` or ``__str__``
+methods of classes fixed by ``@python_2_unicode_compatible``
+to make the output consistent between Python 2.x and 3.x.
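For readers who want to see the mechanism rather than just the behaviour, here is an illustrative sketch of how a decorator of this kind can be written; it is not NLTK's exact implementation, and the names ``py2_unicode_compatible`` and ``Greeting`` are invented for the example::

    import sys

    def py2_unicode_compatible(cls):
        # Under Python 2, keep the text-returning method as __unicode__ and
        # install a bytes-returning __str__ wrapper; under Python 3, do nothing.
        if sys.version_info[0] == 2:
            cls.__unicode__ = cls.__str__
            cls.__str__ = lambda self: self.__unicode__().encode('utf-8')
        return cls

    @py2_unicode_compatible
    class Greeting(object):
        def __str__(self):
            return u'\xa1hola!'  # written once, as text

    print(Greeting())  # bytes under Python 2.x, text under Python 3.x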
diff --git a/nlp_resource_data/nltk/test/compat_fixt.py b/nlp_resource_data/nltk/test/compat_fixt.py
new file mode 100644 (file)
index 0000000..5878d9b
--- /dev/null
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+from nltk.compat import PY3
+
+
+def setup_module(module):
+    from nose import SkipTest
+
+    if PY3:
+        raise SkipTest("compat.doctest is for Python 2.x")
index 73b8fd7..3fa0ae6 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ================
@@ -94,7 +94,7 @@ If the reader methods are called without any arguments, they will
 typically load all documents in the corpus.
 
     >>> len(inaugural.words())
-    149797
+    145735
 
 If a corpus contains a README file, it can be accessed with a ``readme()`` method:
 
@@ -109,7 +109,7 @@ Here are the first few words from each of NLTK's plaintext corpora:
     >>> nltk.corpus.abc.words()
     ['PM', 'denies', 'knowledge', 'of', 'AWB', ...]
     >>> nltk.corpus.genesis.words()
-    ['In', 'the', 'beginning', 'God', 'created', ...]
+    [u'In', u'the', u'beginning', u'God', u'created', ...]
     >>> nltk.corpus.gutenberg.words(fileids='austen-emma.txt')
     ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ...]
     >>> nltk.corpus.inaugural.words()
@@ -199,7 +199,7 @@ CoNLL 2002 Corpus includes named entity chunks.
       (NP the/DT Exchequer/NNP)
       ...)
     >>> print(conll2002.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-    [['Sao', 'Paulo', '(', 'Brasil', ')', ',', ...], ['-'], ...]
+    [[u'Sao', u'Paulo', u'(', u'Brasil', u')', u',', ...], [u'-'], ...]
     >>> for tree in conll2002.chunked_sents()[:2]:
     ...     print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
     (S
@@ -387,8 +387,8 @@ examples illustrate the use of the wordlist corpora:
 
     >>> stopwords.fileids() # doctest: +ELLIPSIS
     ['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', ...]
-    >>> sorted(stopwords.words('portuguese')) # doctest: +ELLIPSIS
-    ['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', ...]
+    >>> stopwords.words('portuguese') # doctest: +ELLIPSIS
+    ['de', 'a', 'o', 'que', 'e', 'do', 'da', 'em', 'um', 'para', ...]
     >>> names.fileids()
     ['female.txt', 'male.txt']
     >>> names.words('male.txt') # doctest: +ELLIPSIS
@@ -595,7 +595,8 @@ We can compute stats for specific product features:
 
     >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
     >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
-    >>> mean = tot / n_reviews
+    >>> # We use float for backward compatibility with division in Python2.7
+    >>> mean = tot/float(n_reviews)
     >>> print(n_reviews, tot, mean)
     15 24 1.6
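The ``float()`` call above matters only under Python 2.x, where ``/`` between two ints is floor division unless true division is imported from ``__future__``. A quick illustration::

    from __future__ import division  # no-op on Python 3.x, fixes ``/`` on Python 2.x

    tot, n_reviews = 24, 15
    print(tot / n_reviews)           # 1.6 on both interpreters thanks to the import
    print(tot / float(n_reviews))    # 1.6 everywhere, even without the import
    print(tot // n_reviews)          # 1 -- explicit floor division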
 
@@ -1215,7 +1216,7 @@ definitions of these data access methods wherever possible.
 
 At a high level, corpora can be divided into three basic types:
 
-- A *token corpus* contains information about specific occurrences of
+- A *token corpus* contains information about specific occurences of
   language use (or linguistic tokens), such as dialogues or written
   texts.  Examples of token corpora are collections of written text
   and collections of speech.
@@ -1336,9 +1337,9 @@ corpora, and returns a flat list of word strings:
     >>> nltk.corpus.treebank.words()
     ['Pierre', 'Vinken', ',', '61', 'years', 'old', ...]
     >>> nltk.corpus.conll2002.words()
-    ['Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', ...]
+    [u'Sao', u'Paulo', u'(', u'Brasil', u')', u',', u'23', ...]
     >>> nltk.corpus.genesis.words()
-    ['In', 'the', 'beginning', 'God', 'created', ...]
+    [u'In', u'the', u'beginning', u'God', u'created', ...]
 
 On the other hand, the `tagged_words()` method is only supported by
 corpora that include part-of-speech annotations:
@@ -1348,7 +1349,7 @@ corpora that include part-of-speech annotations:
     >>> nltk.corpus.treebank.tagged_words()
     [('Pierre', 'NNP'), ('Vinken', 'NNP'), ...]
     >>> nltk.corpus.conll2002.tagged_words()
-    [('Sao', 'NC'), ('Paulo', 'VMI'), ('(', 'Fpa'), ...]
+    [(u'Sao', u'NC'), (u'Paulo', u'VMI'), (u'(', u'Fpa'), ...]
     >>> nltk.corpus.genesis.tagged_words()
     Traceback (most recent call last):
       ...
@@ -2017,20 +2018,20 @@ supplied by a read-only stream.  Note that all of the read operations
 return ``unicode`` objects (not ``str`` objects).
 
     >>> reader.read()         # read the entire file.
-    'This is a test file.\nIt is encoded in ascii.\n'
+    u'This is a test file.\nIt is encoded in ascii.\n'
     >>> reader.seek(0)        # rewind to the start.
     >>> reader.read(5)        # read at most 5 bytes.
-    'This '
+    u'This '
     >>> reader.readline()     # read to the end of the line.
-    'is a test file.\n'
+    u'is a test file.\n'
     >>> reader.seek(0)        # rewind to the start.
     >>> for line in reader:
     ...     print(repr(line))      # iterate over lines
-    'This is a test file.\n'
-    'It is encoded in ascii.\n'
+    u'This is a test file.\n'
+    u'It is encoded in ascii.\n'
     >>> reader.seek(0)        # rewind to the start.
     >>> reader.readlines()    # read a list of line strings
-    ['This is a test file.\n', 'It is encoded in ascii.\n']
+    [u'This is a test file.\n', u'It is encoded in ascii.\n']
     >>> reader.close()
 
 Size argument to ``read()``
@@ -2046,7 +2047,7 @@ characters than the ``size`` argument:
     ... """.decode('ascii').encode('utf-16'))
     >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16')
     >>> reader.read(10)
-    'This '
+    u'This '
 
 If a read block ends in the middle of the byte string encoding a
 single character, then that byte string is stored in an internal
@@ -2059,7 +2060,7 @@ string, which could be mistaken for indicating the end of the file.
 
     >>> reader.seek(0)            # rewind to the start.
     >>> reader.read(1)            # we actually need to read 4 bytes
-    'T'
+    u'T'
     >>> int(reader.tell())
     4
 
@@ -2072,11 +2073,11 @@ bytes from the stream:
 
     >>> reader.seek(0)            # rewind to the start.
     >>> reader.readline()         # stores extra text in a buffer
-    'This is a test file.\n'
+    u'This is a test file.\n'
     >>> print(reader.linebuffer)   # examine the buffer contents
-    ['It is encoded i']
+    [u'It is encoded i']
     >>> reader.read(0)            # returns the contents of the buffer
-    'It is encoded i'
+    u'It is encoded i'
     >>> print(reader.linebuffer)   # examine the buffer contents
     None
 
@@ -2095,14 +2096,14 @@ returned by ``tell``.
     ... """.decode('ascii').encode('utf-16'))
     >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16')
     >>> reader.read(20)
-    'This is a '
+    u'This is a '
     >>> pos = reader.tell(); print(pos)
     22
     >>> reader.read(20)
-    'test file.'
+    u'test file.'
     >>> reader.seek(pos)     # rewind to the position from tell.
     >>> reader.read(20)
-    'test file.'
+    u'test file.'
 
 The ``seek()`` and ``tell()`` methods work properly even when
 ``readline()`` is used.
@@ -2113,14 +2114,14 @@ The ``seek()`` and ``tell()`` methods work property even when
     ... """.decode('ascii').encode('utf-16'))
     >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16')
     >>> reader.readline()
-    'This is a test file.\n'
+    u'This is a test file.\n'
     >>> pos = reader.tell(); print(pos)
     44
     >>> reader.readline()
-    'It is encoded in utf-16.\n'
+    u'It is encoded in utf-16.\n'
     >>> reader.seek(pos)     # rewind to the position from tell.
     >>> reader.readline()
-    'It is encoded in utf-16.\n'
+    u'It is encoded in utf-16.\n'
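The same round trip can be reproduced outside the doctest; this compact sketch uses ``io.BytesIO`` and invented sample text, but the reader API is the one exercised above::

    from io import BytesIO
    from nltk.data import SeekableUnicodeStreamReader

    stream = BytesIO(u'first line\nsecond line\n'.encode('utf-16'))
    reader = SeekableUnicodeStreamReader(stream, 'utf-16')

    print(reader.readline())   # u'first line\n'
    pos = reader.tell()        # a byte offset that is always safe to seek back to
    print(reader.readline())   # u'second line\n'
    reader.seek(pos)
    print(reader.readline())   # u'second line\n' again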
 
 
 Squashed Bugs
index 17b011b..ce0cd83 100644 (file)
@@ -1,3 +1,4 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 
 from nltk.corpus import teardown_module
index 2894a41..011af25 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 Crubadan Corpus Reader
index 1fcfb29..184c512 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 =========================================
@@ -20,6 +20,7 @@ takes as its first argument a URL specifying what file should be
 loaded.  The ``nltk:`` protocol loads files from the NLTK data
 distribution:
 
+    >>> from __future__ import print_function
     >>> tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
     >>> tokenizer.tokenize('Hello.  This is a test.  It works!')
     ['Hello.', 'This is a test.', 'It works!']
@@ -336,7 +337,7 @@ This is mainly intended for internal use. The test simply tests that reading
 and writing work as intended and does not test how much improvement buffering
 provides.
 
-    >>> from io import StringIO
+    >>> from nltk.compat import StringIO
     >>> test = nltk.data.BufferedGzipFile('testbuf.gz', 'wb', size=2**10)
     >>> ans = []
     >>> for i in range(10000):
index 854e11a..31590c4 100755 (executable)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ===================
index a5dabe8..df18fde 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ==================
index 9a10215..d3ab46f 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 
 
 # FIXME: the entire discourse.doctest is skipped if Prover9/Mace4 is
@@ -9,6 +10,6 @@ def setup_module(module):
 
     try:
         m = Mace()
-        m._find_binary("mace4")
+        m._find_binary('mace4')
     except LookupError:
         raise SkipTest("Mace4/Prover9 is not available so discourse.doctest is skipped")
diff --git a/nlp_resource_data/nltk/test/doctest_nose_plugin.py b/nlp_resource_data/nltk/test/doctest_nose_plugin.py
new file mode 100644 (file)
index 0000000..d77210c
--- /dev/null
@@ -0,0 +1,164 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+import re
+import sys
+import os
+import codecs
+import doctest
+from nose.util import tolist, anyp
+from nose.plugins.base import Plugin
+from nose.suite import ContextList
+from nose.plugins.doctests import Doctest, log, DocFileCase
+
+ALLOW_UNICODE = doctest.register_optionflag('ALLOW_UNICODE')
+
+
+class _UnicodeOutputChecker(doctest.OutputChecker):
+    _literal_re = re.compile(r"(\W|^)[uU]([rR]?[\'\"])", re.UNICODE)
+
+    def _remove_u_prefixes(self, txt):
+        return re.sub(self._literal_re, r'\1\2', txt)
+
+    def check_output(self, want, got, optionflags):
+        res = doctest.OutputChecker.check_output(self, want, got, optionflags)
+        if res:
+            return True
+        if not (optionflags & ALLOW_UNICODE):
+            return False
+
+        # ALLOW_UNICODE is active and want != got
+        cleaned_want = self._remove_u_prefixes(want)
+        cleaned_got = self._remove_u_prefixes(got)
+        res = doctest.OutputChecker.check_output(
+            self, cleaned_want, cleaned_got, optionflags
+        )
+        return res
+
+
+_checker = _UnicodeOutputChecker()
+
+
+class DoctestPluginHelper(object):
+    """
+    This mixin adds print_function future import to all test cases.
+
+    It also adds support for:
+        '#doctest +ALLOW_UNICODE' option that
+        makes DocTestCase think u'foo' == 'foo'.
+
+        '#doctest doctestencoding=utf-8' option that
+        changes the encoding of doctest files
+    """
+
+    OPTION_BY_NAME = ('doctestencoding',)
+
+    def loadTestsFromFileUnicode(self, filename):
+        if self.extension and anyp(filename.endswith, self.extension):
+            name = os.path.basename(filename)
+            dh = codecs.open(filename, 'r', self.options.get('doctestencoding'))
+            try:
+                doc = dh.read()
+            finally:
+                dh.close()
+
+            fixture_context = None
+            globs = {'__file__': filename}
+            if self.fixtures:
+                base, ext = os.path.splitext(name)
+                dirname = os.path.dirname(filename)
+                sys.path.append(dirname)
+                fixt_mod = base + self.fixtures
+                try:
+                    fixture_context = __import__(fixt_mod, globals(), locals(), ["nop"])
+                except ImportError as e:
+                    log.debug("Could not import %s: %s (%s)", fixt_mod, e, sys.path)
+                log.debug("Fixture module %s resolved to %s", fixt_mod, fixture_context)
+                if hasattr(fixture_context, 'globs'):
+                    globs = fixture_context.globs(globs)
+            parser = doctest.DocTestParser()
+            test = parser.get_doctest(
+                doc, globs=globs, name=name, filename=filename, lineno=0
+            )
+            if test.examples:
+                case = DocFileCase(
+                    test,
+                    optionflags=self.optionflags,
+                    setUp=getattr(fixture_context, 'setup_test', None),
+                    tearDown=getattr(fixture_context, 'teardown_test', None),
+                    result_var=self.doctest_result_var,
+                )
+                if fixture_context:
+                    yield ContextList((case,), context=fixture_context)
+                else:
+                    yield case
+            else:
+                yield False  # no tests to load
+
+    def loadTestsFromFile(self, filename):
+
+        cases = self.loadTestsFromFileUnicode(filename)
+
+        for case in cases:
+            if isinstance(case, ContextList):
+                yield ContextList([self._patchTestCase(c) for c in case], case.context)
+            else:
+                yield self._patchTestCase(case)
+
+    def loadTestsFromModule(self, module):
+        """Load doctests from the module.
+        """
+        for suite in super(DoctestPluginHelper, self).loadTestsFromModule(module):
+            cases = [self._patchTestCase(case) for case in suite._get_tests()]
+            yield self.suiteClass(cases, context=module, can_split=False)
+
+    def _patchTestCase(self, case):
+        if case:
+            case._dt_test.globs['print_function'] = print_function
+            case._dt_checker = _checker
+        return case
+
+    def configure(self, options, config):
+        # it is overriden in order to fix doctest options discovery
+
+        Plugin.configure(self, options, config)
+        self.doctest_result_var = options.doctest_result_var
+        self.doctest_tests = options.doctest_tests
+        self.extension = tolist(options.doctestExtension)
+        self.fixtures = options.doctestFixtures
+        self.finder = doctest.DocTestFinder()
+
+        # super(DoctestPluginHelper, self).configure(options, config)
+        self.optionflags = 0
+        self.options = {}
+
+        if options.doctestOptions:
+            stroptions = ",".join(options.doctestOptions).split(',')
+            for stroption in stroptions:
+                try:
+                    if stroption.startswith('+'):
+                        self.optionflags |= doctest.OPTIONFLAGS_BY_NAME[stroption[1:]]
+                        continue
+                    elif stroption.startswith('-'):
+                        self.optionflags &= ~doctest.OPTIONFLAGS_BY_NAME[stroption[1:]]
+                        continue
+                    try:
+                        key, value = stroption.split('=')
+                    except ValueError:
+                        pass
+                    else:
+                        if not key in self.OPTION_BY_NAME:
+                            raise ValueError()
+                        self.options[key] = value
+                        continue
+                except (AttributeError, ValueError, KeyError):
+                    raise ValueError("Unknown doctest option {}".format(stroption))
+                else:
+                    raise ValueError(
+                        "Doctest option is not a flag or a key/value pair: {} ".format(
+                            stroption
+                        )
+                    )
+
+
+class DoctestFix(DoctestPluginHelper, Doctest):
+    pass
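The essence of the ``ALLOW_UNICODE`` flag implemented above is the regular expression that strips ``u``/``U`` string prefixes from both the expected and the actual output before comparing them. A standalone illustration (the helper name ``strip_u_prefixes`` is mine)::

    import re

    literal_re = re.compile(r"(\W|^)[uU]([rR]?[\'\"])", re.UNICODE)

    def strip_u_prefixes(text):
        return literal_re.sub(r'\1\2', text)

    want = "[u'Sao', u'Paulo', u'(', u'Brasil']"
    got = "['Sao', 'Paulo', '(', 'Brasil']"
    print(strip_u_prefixes(want) == strip_u_prefixes(got))  # True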
index a0cd1f3..6163052 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ================================
index b866978..a1775f8 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 =========================
@@ -9,6 +9,7 @@
 
 Grammars can be parsed from strings.
 
+    >>> from __future__ import print_function
     >>> import nltk
     >>> from nltk import grammar, parse
     >>> g = """
index 0c14435..8c35dad 100644 (file)
@@ -1,9 +1,10 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ==================================
  Feature Structures & Unification
 ==================================
+    >>> from __future__ import print_function
     >>> from nltk.featstruct import FeatStruct
     >>> from nltk.sem.logic import Variable, VariableExpression, Expression
 
index d1ecc80..6de3a41 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ========
index 2c7f3d8..4453518 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ===============================================
index 386e3e0..2e27597 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 =======================================
@@ -60,7 +60,7 @@ Each word is represented in the space of 300 dimensions:
 Finding the top n words that are similar to a target word is simple. The result is a list of n words with their similarity scores.
 
     >>> model.most_similar(positive=['university'], topn = 3)
-    [('universities', 0.70039...), ('faculty', 0.67809...), ('undergraduate', 0.65870...)]
+    [(u'universities', 0.70039...), (u'faculty', 0.67809...), (u'undergraduate', 0.65870...)]
 
 Finding a word that does not belong in a list is also supported, although implementing this yourself is simple.
 
@@ -71,10 +71,10 @@ Mikolov et al. (2013) figured out that word embedding captures much of syntactic
 the vector 'King - Man + Woman' is close to 'Queen' and 'Germany - Berlin + Paris' is close to 'France'.
 
     >>> model.most_similar(positive=['woman','king'], negative=['man'], topn = 1)
-    [('queen', 0.71181...)]
+    [(u'queen', 0.71181...)]
 
     >>> model.most_similar(positive=['Paris','Germany'], negative=['Berlin'], topn = 1)
-    [('France', 0.78840...)]
+    [(u'France', 0.78840...)]
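A self-contained version of these analogy queries can be run as follows; the sample-model path is an assumption based on the ``word2vec_sample`` data package rather than something shown in this hunk::

    import gensim
    from nltk.data import find

    path = str(find('models/word2vec_sample/pruned.word2vec.txt'))  # assumed location
    model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=False)

    print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))
    print(model.most_similar(positive=['Paris', 'Germany'], negative=['Berlin'], topn=1))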
 
 We can visualize the word embeddings using t-SNE (http://lvdmaaten.github.io/tsne/). For this demonstration, we visualize the first 1000 words.
 
index 2de144c..b1a6d2e 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 
 
 def setup_module(module):
index 08b96e3..7bf29a0 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ==============================================================================
index a76e96f..1329794 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 .. see also: gluesemantics.doctest
index 1a7fee3..70e149a 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 
 
 def setup_module(module):
@@ -6,6 +7,6 @@ def setup_module(module):
     from nltk.parse.malt import MaltParser
 
     try:
-        depparser = MaltParser("maltparser-1.7.2")
+        depparser = MaltParser('maltparser-1.7.2')
     except LookupError:
         raise SkipTest("MaltParser is not available")
index c604069..7cae9d9 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ===============
index 2eab462..4221537 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ==========================
index 31b46c7..7ce8167 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 .. _align howto: align.html
index 5bf1501..c2a41a3 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ====================================
index 5103cd9..3fe9d03 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 
 
 def setup_module(module):
@@ -7,7 +8,7 @@ def setup_module(module):
 
     try:
         m = Mace()
-        m._find_binary("mace4")
+        m._find_binary('mace4')
     except LookupError:
         raise SkipTest(
             "Mace4/Prover9 is not available so inference.doctest was skipped"
index f906203..74c2bd9 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ==========================================
index f82af81..181b080 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ============================
index c2a97c8..f3bde33 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 .. -*- coding: utf-8 -*-
index 45c6429..ab27009 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 =======================
diff --git a/nlp_resource_data/nltk/test/meteor.doctest b/nlp_resource_data/nltk/test/meteor.doctest
deleted file mode 100644 (file)
index 7544d25..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-.. Copyright (C) 2001-2020 NLTK Project
-.. For license information, see LICENSE.TXT
-
-.. -*- coding: utf-8 -*-
-
-=============
-METEOR tests
-=============
-
-No Allignment test
-------------------
-
-    >>> from nltk.translate import meteor
-
-If the candidate has no alignment to any of the references, the METEOR score is 0.
-
-    >>> round(meteor(
-    ...     ['The candidate has no alignment to any of the references'],
-    ...     'John loves Mary'
-    ... ),4)
-    0.0
-
-Tests based on wikipedia examples
----------------------------------
-
-Testing on `wikipedia examples <https://en.wikipedia.org/wiki/METEOR#Examples>`_
-
-    >>> same_res = round(meteor(
-    ...       ['The cat sat on the mat'], 
-    ...       'The cat sat on the mat'
-    ...       ),4)
-    >>> abs(same_res - 0.9977) < 1e-2
-    True
-
-    >>> meteor(
-    ...       ['The cat sat on the mat'], 
-    ...       'on the mat sat the cat'
-    ...       )
-    0.5
-
-    >>> round(meteor(
-    ...       ['The cat sat on the mat'], 
-    ...       'The cat was sat on the mat'
-    ...       ),4)
-    0.9654
index 5ff9877..139a888 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 =======
@@ -8,6 +8,7 @@ Metrics
 The `nltk.metrics` package provides a variety of *evaluation measures*
 which can be used for a wide variety of NLP tasks.
 
+   >>> from __future__ import print_function
    >>> from nltk.metrics import *
 
 ------------------
@@ -51,14 +52,6 @@ String edit distance (Levenshtein):
 
     >>> edit_distance("rain", "shine")
     3
-    >>> edit_distance_align("shine", "shine")
-    [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]
-    >>> edit_distance_align("rain", "brainy")
-    [(0, 0), (1, 1), (1, 2), (2, 3), (3, 4), (4, 5), (4, 6)]
-    >>> edit_distance_align("", "brainy")
-    [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6)]
-    >>> edit_distance_align("", "")
-    [(0, 0)]
 
 Other distance measures:
 
@@ -275,18 +268,3 @@ For trigrams, we have to provide more count information:
     True
     >>> tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts, N)
     True
-
-
-For fourgrams, we have to provide more count information:
-
-    >>> n_w1_w2_w3_w4 = 5
-    >>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40
-    >>> n_w1_w2_w3, n_w2_w3_w4 = 20, 10 
-    >>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3)
-    >>> triplet_counts = (n_w1_w2_w3, n_w2_w3_w4)
-    >>> n_w1, n_w2, n_w3, n_w4 = 100, 200, 300, 400
-    >>> uni_counts = (n_w1, n_w2, n_w3, n_w4)
-    >>> N = 14307668
-    >>> qam = QuadgramAssocMeasures
-    >>> qam.raw_freq(n_w1_w2_w3_w4, pair_counts, triplet_counts, uni_counts, N) == 1. * n_w1_w2_w3_w4 / N
-    True
index d72e0b3..71343b3 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 --------------------------------------------------------------------------------
index ea17c60..be761b3 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ======================
index e6bdffa..0c38381 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 
 
 def setup_module(module):
@@ -7,7 +8,7 @@ def setup_module(module):
 
     try:
         m = Mace()
-        m._find_binary("mace4")
+        m._find_binary('mace4')
     except LookupError:
         raise SkipTest(
             "Mace4/Prover9 is not available so nonmonotonic.doctest was skipped"
index c84b469..b7c0ee1 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 =========
index 84cee4a..87051c9 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ==================================
@@ -140,8 +140,8 @@ We just saw a ``for`` loop above.  Another useful control structure is a
     >>> [w for w in psent1 if w.endswith('a')]
     ['da', 'gl\xf3ria', 'era', 'a', 'coisa', 'humana', 'a', 'sua', 'genu\xedna']
     >>> [w for w in ptext4 if len(w) > 15]
-    ['norte-irlandeses', 'pan-nacionalismo', 'predominatemente', 'primeiro-ministro',
-    'primeiro-ministro', 'irlandesa-americana', 'responsabilidades', 'significativamente']
+    [u'norte-irlandeses', u'pan-nacionalismo', u'predominatemente', u'primeiro-ministro',
+    u'primeiro-ministro', u'irlandesa-americana', u'responsabilidades', u'significativamente']
 
 We can examine the relative frequency of words in a text, using ``FreqDist``:
 
@@ -151,19 +151,19 @@ We can examine the relative frequency of words in a text, using ``FreqDist``:
     >>> fd1['olhos']
     137
     >>> fd1.max()
-    ','
+    u','
     >>> fd1.samples()[:100]
-    [',', '.', 'a', 'que', 'de', 'e', '-', 'o', ';', 'me', 'um', 'n\xe3o',
-    '\x97', 'se', 'do', 'da', 'uma', 'com', 'os', '\xe9', 'era', 'as', 'eu',
-    'lhe', 'ao', 'em', 'para', 'mas', '...', '!', '\xe0', 'na', 'mais', '?',
-    'no', 'como', 'por', 'N\xe3o', 'dos', 'o', 'ele', ':', 'Virg\xedlia',
-    'me', 'disse', 'minha', 'das', 'O', '/', 'A', 'CAP\xcdTULO', 'muito',
-    'depois', 'coisa', 'foi', 'sem', 'olhos', 'ela', 'nos', 'tinha', 'nem',
-    'E', 'outro', 'vida', 'nada', 'tempo', 'menos', 'outra', 'casa', 'homem',
-    'porque', 'quando', 'mim', 'mesmo', 'ser', 'pouco', 'estava', 'dia',
-    't\xe3o', 'tudo', 'Mas', 'at\xe9', 'D', 'ainda', 's\xf3', 'alguma',
-    'la', 'vez', 'anos', 'h\xe1', 'Era', 'pai', 'esse', 'lo', 'dizer', 'assim',
-    'ent\xe3o', 'dizia', 'aos', 'Borba']
+    [u',', u'.', u'a', u'que', u'de', u'e', u'-', u'o', u';', u'me', u'um', u'n\xe3o',
+    u'\x97', u'se', u'do', u'da', u'uma', u'com', u'os', u'\xe9', u'era', u'as', u'eu',
+    u'lhe', u'ao', u'em', u'para', u'mas', u'...', u'!', u'\xe0', u'na', u'mais', u'?',
+    u'no', u'como', u'por', u'N\xe3o', u'dos', u'ou', u'ele', u':', u'Virg\xedlia',
+    u'meu', u'disse', u'minha', u'das', u'O', u'/', u'A', u'CAP\xcdTULO', u'muito',
+    u'depois', u'coisa', u'foi', u'sem', u'olhos', u'ela', u'nos', u'tinha', u'nem',
+    u'E', u'outro', u'vida', u'nada', u'tempo', u'menos', u'outra', u'casa', u'homem',
+    u'porque', u'quando', u'mim', u'mesmo', u'ser', u'pouco', u'estava', u'dia',
+    u't\xe3o', u'tudo', u'Mas', u'at\xe9', u'D', u'ainda', u's\xf3', u'alguma',
+    u'la', u'vez', u'anos', u'h\xe1', u'Era', u'pai', u'esse', u'lo', u'dizer', u'assim',
+    u'ent\xe3o', u'dizia', u'aos', u'Borba']
 
 ---------------
 Reading Corpora
@@ -244,7 +244,7 @@ We can access this corpus as a sequence of words or tagged words as follows:
     [['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', 'milh\xe3o',
     'em', 'a', 'venda', 'de', 'a', 'Pinhal', 'em', 'S\xe3o', 'Paulo'],
     ['Programe', 'sua', 'viagem', 'a', 'a', 'Exposi\xe7\xe3o', 'Nacional',
-    'do', 'Zeb', ',', 'que', 'come\xe7a', 'dia', '25'], ...]
+    'do', 'Zebu', ',', 'que', 'come\xe7a', 'dia', '25'], ...]
     >>> nltk.corpus.mac_morpho.tagged_words()
     [('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ...]
 
@@ -258,7 +258,7 @@ We can also access it in sentence chunks.
       ('Paulo', 'NPROP')],
      [('Programe', 'V'), ('sua', 'PROADJ'), ('viagem', 'N'), ('a', 'PREP|+'),
       ('a', 'ART'), ('Exposi\xe7\xe3o', 'NPROP'), ('Nacional', 'NPROP'),
-      ('do', 'NPROP'), ('Zeb', 'NPROP'), (',', ','), ('que', 'PRO-KS-REL'),
+      ('do', 'NPROP'), ('Zebu', 'NPROP'), (',', ','), ('que', 'PRO-KS-REL'),
       ('come\xe7a', 'V'), ('dia', 'N'), ('25', 'N|AP')], ...]
 
 This data can be used to train taggers (examples below for the Floresta treebank).
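As a quick illustration of that sentence, a unigram tagger can be trained directly on the ``mac_morpho`` tagged sentences shown above; the slice size is arbitrary, and unseen words will receive ``None`` tags::

    import nltk
    from nltk.corpus import mac_morpho

    train_sents = mac_morpho.tagged_sents()[:2000]   # small slice to keep it fast
    tagger = nltk.UnigramTagger(train_sents)         # most-frequent-tag-per-word baseline
    print(tagger.tag(['Programe', 'sua', 'viagem']))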
index f417bc6..afbd59e 100644 (file)
@@ -1,4 +1,7 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
+from nltk.compat import PY3
+
 from nltk.corpus import teardown_module
 
 
@@ -8,3 +11,8 @@ def setup_module(module):
     raise SkipTest(
         "portuguese_en.doctest imports nltk.examples.pt which doesn't exist!"
     )
+
+    if not PY3:
+        raise SkipTest(
+            "portuguese_en.doctest was skipped because non-ascii doctests are not supported under Python 2.x"
+        )
index ea36fe3..3cb582e 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ===========
index fc786c9..680dab6 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 
 
 # probability.doctest uses HMM which requires numpy;
index d3e8a68..9bec607 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ========
index 6df3c1c..085fa90 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ======================
@@ -65,24 +65,24 @@ this case, the strings are also POS tagged.
     >>> from nltk.corpus import conll2002
     >>> for doc in conll2002.chunked_sents('ned.train')[27]:
     ...     print(doc)
-    ('Het', 'Art')
+    (u'Het', u'Art')
     (ORG Hof/N van/Prep Cassatie/N)
-    ('verbrak', 'V')
-    ('het', 'Art')
-    ('arrest', 'N')
-    ('zodat', 'Conj')
-    ('het', 'Pron')
-    ('moest', 'V')
-    ('worden', 'V')
-    ('overgedaan', 'V')
-    ('door', 'Prep')
-    ('het', 'Art')
-    ('hof', 'N')
-    ('van', 'Prep')
-    ('beroep', 'N')
-    ('van', 'Prep')
+    (u'verbrak', u'V')
+    (u'het', u'Art')
+    (u'arrest', u'N')
+    (u'zodat', u'Conj')
+    (u'het', u'Pron')
+    (u'moest', u'V')
+    (u'worden', u'V')
+    (u'overgedaan', u'V')
+    (u'door', u'Prep')
+    (u'het', u'Art')
+    (u'hof', u'N')
+    (u'van', u'Prep')
+    (u'beroep', u'N')
+    (u'van', u'Prep')
     (LOC Antwerpen/N)
-    ('.', 'Punc')
+    (u'.', u'Punc')
 
 Relation Extraction
 ~~~~~~~~~~~~~~~~~~~
@@ -234,16 +234,16 @@ presented as something that looks more like a clause in a logical language.
     ...         for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
     >>> for r in rels[:10]:
     ...     print(relextract.clause(r, relsym='DE'))    # doctest: +NORMALIZE_WHITESPACE
-    DE('tribunal_supremo', 'victoria')
-    DE('museo_de_arte', 'alcorc\xf3n')
-    DE('museo_de_bellas_artes', 'a_coru\xf1a')
-    DE('siria', 'l\xedbano')
-    DE('uni\xf3n_europea', 'pek\xedn')
-    DE('ej\xe9rcito', 'rogberi')
-    DE('juzgado_de_instrucci\xf3n_n\xfamero_1', 'san_sebasti\xe1n')
-    DE('psoe', 'villanueva_de_la_serena')
-    DE('ej\xe9rcito', 'l\xedbano')
-    DE('juzgado_de_lo_penal_n\xfamero_2', 'ceuta')
+    DE(u'tribunal_supremo', u'victoria')
+    DE(u'museo_de_arte', u'alcorc\xf3n')
+    DE(u'museo_de_bellas_artes', u'a_coru\xf1a')
+    DE(u'siria', u'l\xedbano')
+    DE(u'uni\xf3n_europea', u'pek\xedn')
+    DE(u'ej\xe9rcito', u'rogberi')
+    DE(u'juzgado_de_instrucci\xf3n_n\xfamero_1', u'san_sebasti\xe1n')
+    DE(u'psoe', u'villanueva_de_la_serena')
+    DE(u'ej\xe9rcito', u'l\xedbano')
+    DE(u'juzgado_de_lo_penal_n\xfamero_2', u'ceuta')
     >>> vnv = """
     ... (
     ... is/V|
@@ -258,6 +258,6 @@ presented as something that looks more like a clause in a logical language.
     >>> for doc in conll2002.chunked_sents('ned.train'):
     ...     for r in relextract.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
     ...         print(relextract.clause(r, relsym="VAN"))
-    VAN("cornet_d'elzius", 'buitenlandse_handel')
-    VAN('johan_rottiers', 'kardinaal_van_roey_instituut')
-    VAN('annie_lennox', 'eurythmics')
+    VAN(u"cornet_d'elzius", u'buitenlandse_handel')
+    VAN(u'johan_rottiers', u'kardinaal_van_roey_instituut')
+    VAN(u'annie_lennox', u'eurythmics')
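The ``VAN`` pattern used above is built from the longer ``vnv`` regular expression; a deliberately simplified stand-in (matching only a literal ``van`` token, so it may find a different set of pairs) looks like this::

    import re
    from nltk.corpus import conll2002
    from nltk.sem import relextract

    VAN = re.compile(r'\bvan\b')   # simplified: the doctest's vnv pattern also requires a verb
    for doc in conll2002.chunked_sents('ned.train'):
        for rel in relextract.extract_rels('PER', 'ORG', doc,
                                           corpus='conll2002', pattern=VAN):
            print(relextract.clause(rel, relsym='VAN'))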
index fc31db4..318efcd 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 =========================
index 9dc06ec..8f40cc6 100644 (file)
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import, print_function
 import sys
 import os
 import nose
@@ -7,13 +8,15 @@ from nose.plugins.manager import PluginManager
 from nose.plugins.doctests import Doctest
 from nose.plugins import builtin
 
-NLTK_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+NLTK_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
 sys.path.insert(0, NLTK_ROOT)
 
-NLTK_TEST_DIR = os.path.join(NLTK_ROOT, "nltk")
+NLTK_TEST_DIR = os.path.join(NLTK_ROOT, 'nltk')
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     # there shouldn't be import from NLTK for coverage to work properly
+    from doctest_nose_plugin import DoctestFix
+
     try:
         # Import RedNose plugin for colored test output
         from rednose import RedNose
@@ -30,7 +33,9 @@ if __name__ == "__main__":
 
         def loadPlugins(self):
             for plug in builtin.plugins:
-                self.addPlugin(plug())
+                if plug != Doctest:
+                    self.addPlugin(plug())
+            self.addPlugin(DoctestFix())
             if rednose_available:
                 self.addPlugin(RedNose())
 
@@ -50,25 +55,26 @@ if __name__ == "__main__":
     if not args:
         args = [NLTK_TEST_DIR]
 
-    if all(arg.startswith("-") for arg in args):
+    if all(arg.startswith('-') for arg in args):
         # only extra options were passed
         args += [NLTK_TEST_DIR]
 
     # Activate RedNose and hide skipped test messages from output
     if rednose_available:
-        args += ["--rednose", "--hide-skips"]
+        args += ['--rednose', '--hide-skips']
 
     arguments = [
-        "--exclude=",  # why is this needed?
+        '--exclude=',  # why is this needed?
         # '--with-xunit',
         # '--xunit-file=$WORKSPACE/nosetests.xml',
         # '--nocapture',
-        "--with-doctest",
+        '--with-doctest',
         # '--doctest-tests',
         # '--debug=nose,nose.importer,nose.inspector,nose.plugins,nose.result,nose.selector',
-        "--doctest-extension=.doctest",
-        "--doctest-fixtures=_fixt",
-        "--doctest-options=+ELLIPSIS,+NORMALIZE_WHITESPACE,+IGNORE_EXCEPTION_DETAIL",
+        '--doctest-extension=.doctest',
+        '--doctest-fixtures=_fixt',
+        '--doctest-options=+ELLIPSIS,+NORMALIZE_WHITESPACE,+IGNORE_EXCEPTION_DETAIL,+ALLOW_UNICODE,'
+        'doctestencoding=utf-8',
         # '--verbosity=3',
     ] + args
 
index 82918ba..bb8a7cf 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 
 
 # skip segmentation.doctest if numpy is not available
index 32c0f84..f1a1f3c 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 =========
index 8d67144..135180d 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 
 # reset the variables counter before running tests
 def setup_module(module):
index 36e5b20..359e165 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ===================
index 70f25ee..46126bb 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ======================
index 48fdcd3..5636163 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 =================
@@ -8,7 +8,8 @@ EasyInstall Tests
 This file contains some simple tests that will be run by EasyInstall in
 order to test the installation when NLTK-Data is absent.
 
+    >>> from __future__ import print_function
+
 ------------
 Tokenization
 ------------
index b80104d..2cf9857 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ==========
@@ -11,6 +11,7 @@ Overview
 Stemmers remove morphological affixes from words, leaving only the
 word stem.
 
+    >>> from __future__ import print_function
     >>> from nltk.stem import *
 
 Unit tests for the Porter stemmer
index 7103b41..2248cba 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 Regression Tests
index a3a7dfa..f99e22a 100644 (file)
@@ -1,6 +1,7 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
+    >>> from __future__ import print_function
     >>> from nltk.tokenize import *
 
 Regression Tests: Treebank Tokenizer
@@ -42,12 +43,12 @@ Some test strings.
 
 Testing improvement made to the TreebankWordTokenizer
 
-    >>> sx1 = '\xabNow that I can do.\xbb'
-    >>> expected = ['\xab', 'Now', 'that', 'I', 'can', 'do', '.', '\xbb']
+    >>> sx1 = u'\xabNow that I can do.\xbb'
+    >>> expected = [u'\xab', u'Now', u'that', u'I', u'can', u'do', u'.', u'\xbb']
     >>> word_tokenize(sx1) == expected
     True
-    >>> sx2 = 'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.'
-    >>> expected = ['The', 'unicode', '201C', 'and', '201D', '\u201c', 'LEFT', '(', 'RIGHT', ')', 'DOUBLE', 'QUOTATION', 'MARK', '\u201d', 'is', 'also', 'OPEN_PUNCT', 'and', 'CLOSE_PUNCT', '.']
+    >>> sx2 = u'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.'
+    >>> expected = [u'The', u'unicode', u'201C', u'and', u'201D', u'\u201c', u'LEFT', u'(', u'RIGHT', u')', u'DOUBLE', u'QUOTATION', u'MARK', u'\u201d', u'is', u'also', u'OPEN_PUNCT', u'and', u'CLOSE_PUNCT', u'.']
     >>> word_tokenize(sx2) == expected
     True
 
@@ -175,7 +176,7 @@ It should not hang on long sequences of the same punctuation character.
     >>> tknzr = TweetTokenizer()
     >>> s10 = "Photo: Aujourd'hui sur http://t.co/0gebOFDUzn Projet... http://t.co/bKfIUbydz2.............................. http://fb.me/3b6uXpz0L"
     >>> tknzr.tokenize(s10)
-    ['Photo', ':', "Aujourd'hui", 'sur', 'http://t.co/0gebOFDUzn', 'Projet', '...', 'http://t.co/bKfIUbydz2', '...', 'http://fb.me/3b6uXpz0L']
+    [u'Photo', u':', u"Aujourd'hui", u'sur', u'http://t.co/0gebOFDUzn', u'Projet', u'...', u'http://t.co/bKfIUbydz2', u'...', u'http://fb.me/3b6uXpz0L']
 
 
 Regression Tests: PunktSentenceTokenizer
index 1e430ad..1abf684 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ===============================
index 87966fb..6a1bb70 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 .. -*- coding: utf-8 -*-
index 17b011b..ce0cd83 100644 (file)
@@ -1,3 +1,4 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 
 from nltk.corpus import teardown_module
index 9389417..a4b93ed 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ===============================
index 3c129c7..8302c2c 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ========================================================
index 973c27d..e44e504 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 -------------------------------------------
index 76578be..2b18bf3 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/__init__.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/test/unit/__pycache__/test_2x_compat.cpython-37.pyc b/nlp_resource_data/nltk/test/unit/__pycache__/test_2x_compat.cpython-37.pyc
new file mode 100644 (file)
index 0000000..e912498
Binary files /dev/null and b/nlp_resource_data/nltk/test/unit/__pycache__/test_2x_compat.cpython-37.pyc differ
index ece381a..8b09c95 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_aline.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_aline.cpython-37.pyc differ
index 5a5d98c..4770fe9 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_brill.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_brill.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/test/unit/__pycache__/test_cfd_mutation.cpython-37.pyc b/nlp_resource_data/nltk/test/unit/__pycache__/test_cfd_mutation.cpython-37.pyc
deleted file mode 100644 (file)
index 4d60415..0000000
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_cfd_mutation.cpython-37.pyc and /dev/null differ
diff --git a/nlp_resource_data/nltk/test/unit/__pycache__/test_cfg2chomsky.cpython-37.pyc b/nlp_resource_data/nltk/test/unit/__pycache__/test_cfg2chomsky.cpython-37.pyc
deleted file mode 100644 (file)
index eaa2fb5..0000000
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_cfg2chomsky.cpython-37.pyc and /dev/null differ
index c6d9f89..b9ab911 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_chunk.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_chunk.cpython-37.pyc differ
index 82e8d40..3f89e36 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_classify.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_classify.cpython-37.pyc differ
index 7431372..957a3ce 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_collocations.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_collocations.cpython-37.pyc differ
index fd3e1a9..a8179e8 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_concordance.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_concordance.cpython-37.pyc differ
index 06ebde8..06741c6 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_corenlp.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_corenlp.cpython-37.pyc differ
index 1e3fa28..26a4114 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_corpora.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_corpora.cpython-37.pyc differ
index 7110524..c44070a 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_corpus_views.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_corpus_views.cpython-37.pyc differ
index d5bc73f..4aefd5c 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_data.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_data.cpython-37.pyc differ
index fae667e..86a7e6c 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_disagreement.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_disagreement.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/test/unit/__pycache__/test_freqdist.cpython-37.pyc b/nlp_resource_data/nltk/test/unit/__pycache__/test_freqdist.cpython-37.pyc
deleted file mode 100644 (file)
index c197f56..0000000
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_freqdist.cpython-37.pyc and /dev/null differ
index 332a391..e0810a3 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_hmm.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_hmm.cpython-37.pyc differ
index 5ceab30..91c9a9d 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_json2csv_corpus.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_json2csv_corpus.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/test/unit/__pycache__/test_json_serialization.cpython-37.pyc b/nlp_resource_data/nltk/test/unit/__pycache__/test_json_serialization.cpython-37.pyc
deleted file mode 100644 (file)
index ae73510..0000000
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_json_serialization.cpython-37.pyc and /dev/null differ
index 1cd2936..34f3af4 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_naivebayes.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_naivebayes.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/test/unit/__pycache__/test_nombank.cpython-37.pyc b/nlp_resource_data/nltk/test/unit/__pycache__/test_nombank.cpython-37.pyc
deleted file mode 100644 (file)
index 6f65ef6..0000000
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_nombank.cpython-37.pyc and /dev/null differ
diff --git a/nlp_resource_data/nltk/test/unit/__pycache__/test_pl196x.cpython-37.pyc b/nlp_resource_data/nltk/test/unit/__pycache__/test_pl196x.cpython-37.pyc
deleted file mode 100644 (file)
index 056626f..0000000
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_pl196x.cpython-37.pyc and /dev/null differ
index 7a4365f..e8a2725 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_pos_tag.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_pos_tag.cpython-37.pyc differ
index 0cffd6a..c984006 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_rte_classify.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_rte_classify.cpython-37.pyc differ
index 3c370a5..a0c09b1 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_seekable_unicode_stream_reader.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_seekable_unicode_stream_reader.cpython-37.pyc differ
index 3b33bc8..2d04fb9 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_senna.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_senna.cpython-37.pyc differ
index d78e5d3..dc55d3b 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_stem.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_stem.cpython-37.pyc differ
index f110b10..9be0186 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_tag.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_tag.cpython-37.pyc differ
index 294e480..9f6d092 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_tgrep.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_tgrep.cpython-37.pyc differ
index b18da62..4a2306a 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_tokenize.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_tokenize.cpython-37.pyc differ
index 2bc7807..15975a5 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_twitter_auth.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_twitter_auth.cpython-37.pyc differ
index 17475b9..1122826 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/test_wordnet.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/test_wordnet.cpython-37.pyc differ
index 688638f..8f2a1f1 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/__pycache__/utils.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/__pycache__/utils.cpython-37.pyc differ
index fc66cb5..130b497 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/lm/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/lm/__pycache__/__init__.cpython-37.pyc differ
index 7eec05f..1cb5393 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/lm/__pycache__/test_counter.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/lm/__pycache__/test_counter.cpython-37.pyc differ
index 592d6df..d32131e 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/lm/__pycache__/test_models.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/lm/__pycache__/test_models.cpython-37.pyc differ
index 2afa322..e81feda 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/lm/__pycache__/test_preprocessing.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/lm/__pycache__/test_preprocessing.cpython-37.pyc differ
index f85a3a9..08fdd71 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/lm/__pycache__/test_vocabulary.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/lm/__pycache__/test_vocabulary.cpython-37.pyc differ
index f7182cf..31fab79 100644 (file)
@@ -1,12 +1,15 @@
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit: Language Model Unit Tests
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 import unittest
 
+import six
+
 from nltk import FreqDist
 from nltk.lm import NgramCounter
 from nltk.util import everygrams
@@ -51,8 +54,8 @@ class NgramCounterTests(unittest.TestCase):
         bigrams = self.trigram_counter[2]
         trigrams = self.trigram_counter[3]
 
-        self.assertCountEqual(expected_bigram_contexts, bigrams.conditions())
-        self.assertCountEqual(expected_trigram_contexts, trigrams.conditions())
+        six.assertCountEqual(self, expected_bigram_contexts, bigrams.conditions())
+        six.assertCountEqual(self, expected_trigram_contexts, trigrams.conditions())
 
     def test_bigram_counts_seen_ngrams(self):
         b_given_a_count = 1
@@ -102,7 +105,7 @@ class NgramCounterTrainingTests(unittest.TestCase):
 
         self.assertFalse(bool(counter[3]))
         self.assertFalse(bool(counter[2]))
-        self.assertCountEqual(words, counter[1].keys())
+        six.assertCountEqual(self, words, counter[1].keys())
 
     def test_train_on_illegal_sentences(self):
         str_sent = ["Check", "this", "out", "!"]
@@ -127,6 +130,6 @@ class NgramCounterTrainingTests(unittest.TestCase):
         bigram_contexts = [("a",), ("c",)]
         trigram_contexts = [("e", "f")]
 
-        self.assertCountEqual(unigrams, counter[1].keys())
-        self.assertCountEqual(bigram_contexts, counter[2].keys())
-        self.assertCountEqual(trigram_contexts, counter[3].keys())
+        six.assertCountEqual(self, unigrams, counter[1].keys())
+        six.assertCountEqual(self, bigram_contexts, counter[2].keys())
+        six.assertCountEqual(self, trigram_contexts, counter[3].keys())
index f39619e..f19edd4 100644 (file)
@@ -1,14 +1,18 @@
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit: Language Model Unit Tests
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import division
 
 import math
+import sys
 import unittest
 
+from six import add_metaclass
 
 from nltk.lm import (
     Vocabulary,
@@ -53,11 +57,19 @@ class ParametrizeTestsMeta(type):
             dct["test_score_{0}".format(i)] = cls.add_score_test(
                 word, context, expected_score
             )
-        return super().__new__(cls, name, bases, dct)
+        return super(ParametrizeTestsMeta, cls).__new__(cls, name, bases, dct)
 
     @classmethod
     def add_score_test(cls, word, context, expected_score):
-        message = "word='{word}', context={context}"
+        if sys.version_info > (3, 5):
+            message = "word='{word}', context={context}"
+        else:
+            # Python 2 doesn't report the mismatched values if we pass a custom
+            # message, so we have to report them manually.
+            message = (
+                "{score} != {expected_score} within 4 places, "
+                "word='{word}', context={context}"
+            )
 
         def test_method(self):
             score = self.model.score(word, context)
@@ -76,8 +88,9 @@ class ParametrizeTestsMeta(type):
         return test
 
 
-class MleBigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
-    """Unit tests for MLE ngram model."""
+@add_metaclass(ParametrizeTestsMeta)
+class MleBigramTests(unittest.TestCase):
+    """unit tests for MLENgramModel class"""
 
     score_tests = [
         ("d", ["c"], 1),
@@ -155,7 +168,8 @@ class MleBigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
         self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
 
 
-class MleTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
+@add_metaclass(ParametrizeTestsMeta)
+class MleTrigramTests(unittest.TestCase):
     """MLE trigram model tests"""
 
     score_tests = [
@@ -179,8 +193,9 @@ class MleTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
         self.model.fit(training_text)
 
 
-class LidstoneBigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
-    """Unit tests for Lidstone class"""
+@add_metaclass(ParametrizeTestsMeta)
+class LidstoneBigramTests(unittest.TestCase):
+    """unit tests for Lidstone class"""
 
     score_tests = [
         # count(d | c) = 1
@@ -237,7 +252,8 @@ class LidstoneBigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
         self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
 
 
-class LidstoneTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
+@add_metaclass(ParametrizeTestsMeta)
+class LidstoneTrigramTests(unittest.TestCase):
     score_tests = [
         # Logic behind this is the same as for bigram model
         ("d", ["c"], 1.1 / 1.8),
@@ -254,8 +270,9 @@ class LidstoneTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
         self.model.fit(training_text)
 
 
-class LaplaceBigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
-    """Unit tests for Laplace class"""
+@add_metaclass(ParametrizeTestsMeta)
+class LaplaceBigramTests(unittest.TestCase):
+    """unit tests for Laplace class"""
 
     score_tests = [
         # basic sanity-check:
@@ -314,7 +331,8 @@ class LaplaceBigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
         self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
 
 
-class WittenBellInterpolatedTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
+@add_metaclass(ParametrizeTestsMeta)
+class WittenBellInterpolatedTrigramTests(unittest.TestCase):
     def setUp(self):
         vocab, training_text = _prepare_test_data(3)
         self.model = WittenBellInterpolated(3, vocabulary=vocab)
@@ -339,13 +357,11 @@ class WittenBellInterpolatedTrigramTests(unittest.TestCase, metaclass=Parametriz
         # gamma(['a', 'b']) = 0.0667
         # mle("c", ["a", "b"]) = 1
         ("c", ["a", "b"], (1 - 0.0667) + 0.0667 * ((1 - 0.1111) * 0.5 + 0.1111 / 18)),
-        # The ngram 'z b c' was not seen, so we should simply revert to
-        # the score of the ngram 'b c'. See issue #2332.
-        ("c", ["z", "b"], ((1 - 0.1111) * 0.5 + 0.1111 / 18)),
     ]
 
 
-class KneserNeyInterpolatedTrigramTests(unittest.TestCase, metaclass=ParametrizeTestsMeta):
+@add_metaclass(ParametrizeTestsMeta)
+class KneserNeyInterpolatedTrigramTests(unittest.TestCase):
     def setUp(self):
         vocab, training_text = _prepare_test_data(3)
         self.model = KneserNeyInterpolated(3, vocabulary=vocab)
@@ -370,14 +386,11 @@ class KneserNeyInterpolatedTrigramTests(unittest.TestCase, metaclass=Parametrize
         # gamma(['a', 'b']) = 0.1 * 1
         # normalizer = total number of trigrams with prefix "ab" = 1 => we can ignore it!
         ("c", ["a", "b"], 0.9 + 0.1 * ((0.9 + 0.2 * (1 / 8)) / 2)),
-        # The ngram 'z b c' was not seen, so we should simply revert to
-        # the score of the ngram 'b c'. See issue #2332.
-        ("c", ["z", "b"], ((0.9 + 0.2 * (1 / 8)) / 2)),
     ]
 
 
 class NgramModelTextGenerationTests(unittest.TestCase):
-    """Using MLE model, generate some text."""
+    """Using MLE estimator, generate some text."""
 
     def setUp(self):
         vocab, training_text = _prepare_test_data(3)
@@ -399,14 +412,10 @@ class NgramModelTextGenerationTests(unittest.TestCase):
             self.model.generate(text_seed=("a", "<s>"), random_seed=2), "a"
         )
 
-    def test_generate_cycle(self):
-        # Add a cycle to the model: bd -> b, db -> d
-        more_training_text = [list(padded_everygrams(self.model.order, list("bdbdbd")))]
-        self.model.fit(more_training_text)
-        # Test that we can escape the cycle
+    def test_generate_no_seed_unigrams(self):
         self.assertEqual(
-            self.model.generate(7, text_seed=("b", "d"), random_seed=5),
-            ["b", "d", "b", "d", "b", "d", "</s>"],
+            self.model.generate(5, random_seed=3),
+            ["<UNK>", "</s>", "</s>", "</s>", "</s>"],
         )
 
     def test_generate_with_text_seed(self):
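The class-statement changes above exist because `class C(Base, metaclass=M)` is Python-3-only syntax; six.add_metaclass applies the metaclass in a way that parses on both interpreters, and the explicit `super(Cls, cls).__new__` spelling replaces the bare `super()` call for the same reason. A minimal sketch of the decorator pattern (the metaclass here is a toy, not nltk's ParametrizeTestsMeta):

    import unittest

    from six import add_metaclass

    class AddOneTestMeta(type):
        """Toy metaclass that injects a single generated test method."""
        def __new__(cls, name, bases, dct):
            dct["test_injected"] = lambda self: self.assertTrue(True)
            # Python-2-compatible super() call, as in the hunk above.
            return super(AddOneTestMeta, cls).__new__(cls, name, bases, dct)

    @add_metaclass(AddOneTestMeta)   # Python 3 only: class DemoTests(unittest.TestCase, metaclass=AddOneTestMeta)
    class DemoTests(unittest.TestCase):
        pass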
index c298552..02a8af5 100644 (file)
@@ -1,6 +1,7 @@
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit: Language Model Unit Tests
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
index db82eb5..dd78b42 100644 (file)
@@ -1,6 +1,7 @@
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit: Language Model Unit Tests
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -8,6 +9,7 @@
 import unittest
 from collections import Counter
 
+import six
 from nltk.lm import Vocabulary
 
 
@@ -59,8 +61,8 @@ class NgramModelVocabularyTests(unittest.TestCase):
         vocab_counts = ["a", "b", "c", "d", "e", "f", "g", "w", "z"]
         vocab_items = ["a", "b", "d", "e", "<UNK>"]
 
-        self.assertCountEqual(vocab_counts, list(self.vocab.counts.keys()))
-        self.assertCountEqual(vocab_items, list(self.vocab))
+        six.assertCountEqual(self, vocab_counts, list(self.vocab.counts.keys()))
+        six.assertCountEqual(self, vocab_items, list(self.vocab))
 
     def test_update_empty_vocab(self):
         empty = Vocabulary(unk_cutoff=2)
@@ -123,7 +125,8 @@ class NgramModelVocabularyTests(unittest.TestCase):
 
     def test_str(self):
         self.assertEqual(
-            str(self.vocab), "<Vocabulary with cutoff=2 unk_label='<UNK>' and 5 items>"
+            str(self.vocab),
+            ("<Vocabulary with cutoff=2 " "unk_label='<UNK>' and 5 items>"),
         )
 
     def test_creation_with_counter(self):
diff --git a/nlp_resource_data/nltk/test/unit/test_2x_compat.py b/nlp_resource_data/nltk/test/unit/test_2x_compat.py
new file mode 100644 (file)
index 0000000..f078373
--- /dev/null
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+"""
+Unit tests for nltk.compat.
+See also nltk/test/compat.doctest.
+"""
+from __future__ import absolute_import, unicode_literals
+import unittest
+
+from nltk.text import Text
+from nltk.compat import PY3, python_2_unicode_compatible
+
+
+def setup_module(module):
+    from nose import SkipTest
+
+    if PY3:
+        raise SkipTest("test_2x_compat is for testing nltk.compat under Python 2.x")
+
+
+class TestTextTransliteration(unittest.TestCase):
+    txt = Text(["São", "Tomé", "and", "Príncipe"])
+
+    def test_repr(self):
+        self.assertEqual(repr(self.txt), br"<Text: S\xe3o Tom\xe9 and Pr\xedncipe...>")
+
+    def test_str(self):
+        self.assertEqual(str(self.txt), b"<Text: Sao Tome and Principe...>")
+
+
+class TestFraction(unittest.TestCase):
+    def test_unnormalize_fraction(self):
+        from fractions import Fraction as NativePythonFraction
+        from nltk.compat import Fraction as NLTKFraction
+
+        # The native fraction should throw a TypeError in Python < 3.5
+        with self.assertRaises(TypeError):
+            NativePythonFraction(0, 1000, _normalize=False)
+
+        # Using nltk.compat.Fraction in Python < 3.5
+        compat_frac = NLTKFraction(0, 1000, _normalize=False)
+        # The numerator and denominator do not change.
+        assert compat_frac.numerator == 0
+        assert compat_frac.denominator == 1000
+        # The floating point value remains normalized.
+        assert float(compat_frac) == 0.0
+
+        # Checks that the fraction is not reduced by
+        # the greatest common divisor (gcd).
+        six_twelve = NLTKFraction(6, 12, _normalize=False)
+        assert six_twelve.numerator == 6
+        assert six_twelve.denominator == 12
+
+        one_two = NLTKFraction(1, 2, _normalize=False)
+        assert one_two.numerator == 1
+        assert one_two.denominator == 2
+
+        # Checks against the native fraction.
+        six_twelve_original = NativePythonFraction(6, 12)
+        # Checks that the rational values of one_two and six_twelve are the same.
+        assert float(one_two) == float(six_twelve) == float(six_twelve_original)
+
+        # Checks that the fraction still gets normalized, even with
+        # _normalize=False, when it is constructed from a float
+        # (via the native fractions.Fraction.from_float path).
+        assert NLTKFraction(3.142, _normalize=False) == NativePythonFraction(3.142)
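The whole file above is meant to run only under Python 2: nose calls setup_module() before collecting the tests, and raising SkipTest there skips every test in the module on Python 3. A minimal sketch of that gate with the version check made explicit (class name and message are illustrative):

    import sys
    import unittest

    def setup_module(module):
        # nose invokes this once per module; SkipTest here skips the whole file.
        from nose import SkipTest
        if sys.version_info[0] >= 3:
            raise SkipTest("these tests only exercise Python 2 behaviour")

    class Py2OnlyTests(unittest.TestCase):
        def test_bytes_repr(self):
            self.assertEqual(repr(b"abc"), "'abc'")   # true on Python 2 only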
index f63d211..72b92c7 100644 (file)
@@ -3,6 +3,7 @@
 Unit tests for nltk.metrics.aline
 """
 
+from __future__ import unicode_literals
 
 import unittest
 
diff --git a/nlp_resource_data/nltk/test/unit/test_cfd_mutation.py b/nlp_resource_data/nltk/test/unit/test_cfd_mutation.py
deleted file mode 100644 (file)
index 7e21d7e..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-import unittest
-from nltk import ConditionalFreqDist, tokenize
-
-class TestEmptyCondFreq(unittest.TestCase):
-    def test_tabulate(self):
-        empty = ConditionalFreqDist()
-        self.assertEqual(empty.conditions(),[])
-        try:
-            empty.tabulate(conditions="BUG") # nonexistent keys shouldn't be added
-        except:
-            pass
-        self.assertEqual(empty.conditions(), [])
-
-
-    def test_plot(self):
-        empty = ConditionalFreqDist()
-        self.assertEqual(empty.conditions(),[])
-        try:
-            empty.plot(conditions=["BUG"]) # nonexistent keys shouldn't be added
-        except:
-            pass
-        self.assertEqual(empty.conditions(),[])
-
-    def test_increment(self):
-        # make sure that we can still mutate cfd normally
-        text = "cow cat mouse cat tiger"
-        cfd = ConditionalFreqDist()
-
-        # create cfd with word length as condition 
-        for word in tokenize.word_tokenize(text):
-            condition = len(word)
-            cfd[condition][word] += 1
-
-        self.assertEqual(cfd.conditions(), [3,5])
-
-        # incrementing previously unseen key is still possible
-        cfd[2]['hi'] += 1
-        self.assertEqual(set(cfd.conditions()),set([3,5,2])) # new condition added
-        self.assertEqual(cfd[2]['hi'], 1) # key's frequency incremented from 0 (unseen) to 1
diff --git a/nlp_resource_data/nltk/test/unit/test_cfg2chomsky.py b/nlp_resource_data/nltk/test/unit/test_cfg2chomsky.py
deleted file mode 100644 (file)
index 686861e..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-# -*- coding: utf-8 -*-
-import unittest
-import nltk
-from nltk.grammar import CFG
-
-
-class ChomskyNormalFormForCFGTest(unittest.TestCase):
-    def test_simple(self):
-        grammar = CFG.fromstring(
-            """
-          S -> NP VP
-          PP -> P NP
-          NP -> Det N | NP PP P
-          VP -> V NP | VP PP
-          VP -> Det
-          Det -> 'a' | 'the'
-          N -> 'dog' | 'cat'
-          V -> 'chased' | 'sat'
-          P -> 'on' | 'in'
-        """
-        )
-        self.assertFalse(grammar.is_flexible_chomsky_normal_form())
-        self.assertFalse(grammar.is_chomsky_normal_form())
-        grammar = grammar.chomsky_normal_form(flexible=True)
-        self.assertTrue(grammar.is_flexible_chomsky_normal_form())
-        self.assertFalse(grammar.is_chomsky_normal_form())
-
-        grammar2 = CFG.fromstring(
-            """
-          S -> NP VP
-          NP -> VP N P
-          VP -> P
-          N -> 'dog' | 'cat'
-          P -> 'on' | 'in'
-        """
-        )
-        self.assertFalse(grammar2.is_flexible_chomsky_normal_form())
-        self.assertFalse(grammar2.is_chomsky_normal_form())
-        grammar2 = grammar2.chomsky_normal_form()
-        self.assertTrue(grammar2.is_flexible_chomsky_normal_form())
-        self.assertTrue(grammar2.is_chomsky_normal_form())
-
-    def test_complex(self):
-        grammar = nltk.data.load('grammars/large_grammars/atis.cfg')
-        self.assertFalse(grammar.is_flexible_chomsky_normal_form())
-        self.assertFalse(grammar.is_chomsky_normal_form())
-        grammar = grammar.chomsky_normal_form(flexible=True)
-        self.assertTrue(grammar.is_flexible_chomsky_normal_form())
-        self.assertFalse(grammar.is_chomsky_normal_form())
index 7d61518..8c40dfc 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
 import unittest
 
 from nltk import RegexpParser
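Many of the small hunks in this commit only prepend `from __future__ import ...` lines. On Python 2 these switch the module over to Python-3 semantics so the same source runs identically on both interpreters; on Python 3 they are no-ops. A minimal sketch of what each of the imports used here changes:

    from __future__ import absolute_import   # bare 'import x' skips same-package modules named x
    from __future__ import division          # 1 / 2 == 0.5 instead of 0
    from __future__ import print_function    # print("a", "b") is a function call
    from __future__ import unicode_literals  # "abc" is a unicode string, not bytes

    print(1 / 2)        # 0.5, even on Python 2
    print(type("abc"))  # <type 'unicode'> on Python 2, <class 'str'> on Python 3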
index 4dae5d1..e9128d2 100644 (file)
@@ -2,6 +2,7 @@
 """
 Unit tests for nltk.classify. See also: nltk/test/classify.doctest
 """
+from __future__ import absolute_import
 from nose import SkipTest
 from nltk import classify
 
index 8949411..8e3535f 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
 import unittest
 
 from nltk.collocations import BigramCollocationFinder
index 83e407b..81ac47b 100644 (file)
@@ -1,15 +1,20 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
 
 import unittest
 import contextlib
 import sys
-from io import StringIO
 
 from nose import with_setup
 
 from nltk.corpus import gutenberg
 from nltk.text import Text
 
+try:
+    from StringIO import StringIO
+except ImportError as e:
+    from io import StringIO
+
 
 @contextlib.contextmanager
 def stdout_redirect(where):
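The import change above prefers Python 2's StringIO module (which accepts both str and unicode) and only falls back to io.StringIO (text-only) on Python 3, where the old module no longer exists. A minimal sketch of the same fallback used to capture printed output:

    try:
        from StringIO import StringIO   # Python 2
    except ImportError:
        from io import StringIO         # Python 3

    buf = StringIO()
    buf.write(u"captured line\n")       # always write text to stay portable
    assert buf.getvalue() == u"captured line\n"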
index 966ecc6..fed13e3 100644 (file)
@@ -7,8 +7,11 @@ Mock test for Stanford CoreNLP wrappers.
 import sys
 from itertools import chain
 from unittest import TestCase, SkipTest
-from unittest.mock import MagicMock
 
+try:
+    from unittest.mock import MagicMock
+except ImportError:
+    raise SkipTest('unittest.mock is not supported in Python 2')
 from nltk.tree import Tree
 from nltk.parse import corenlp
 
@@ -1085,7 +1088,7 @@ class TestParserAPI(TestCase):
 
         corenlp_parser.api_call.assert_called_once_with(
             "The quick brown fox jumps over the lazy dog",
-            properties={'ssplit.eolonly': 'true'},
+            properties={'ssplit.ssplit.eolonly': 'true'},
         )
         self.assertEqual(expected_output, parsed_data)
 
@@ -1411,6 +1414,6 @@ class TestParserAPI(TestCase):
 
         corenlp_parser.api_call.assert_called_once_with(
             "The quick brown fox jumps over the lazy dog",
-            properties={'ssplit.eolonly': 'true'},
+            properties={'ssplit.ssplit.eolonly': 'true'},
         )
         self.assertEqual(expected_output, parsed_data.tree())
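unittest.mock is a Python-3-only standard-library module, so the hunk above turns a missing import into a module-level SkipTest under Python 2. An alternative fallback, sketched here only as an option and not what this diff does, is the third-party `mock` backport:

    try:
        from unittest.mock import MagicMock    # Python 3 stdlib
    except ImportError:
        from mock import MagicMock             # Python 2 backport (pip install mock)

    api_call = MagicMock(return_value={"status": "ok"})
    assert api_call("any", "args") == {"status": "ok"}
    api_call.assert_called_once_with("any", "args")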
index 8b105b8..bce083b 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
 import unittest
 
 from nltk.corpus import (
@@ -12,6 +13,7 @@ from nltk.corpus import (
     udhr,
 )  # mwa_ppdb
 
+from nltk.compat import python_2_unicode_compatible
 from nltk.tree import Tree
 from nltk.test.unit.utils import skipIf
 
index 29d8a3c..222385a 100644 (file)
@@ -2,6 +2,7 @@
 """
 Corpus View Regression Tests
 """
+from __future__ import absolute_import, unicode_literals
 import unittest
 import nltk.data
 from nltk.corpus.reader.util import (
index 6a88868..3054868 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
 import unittest
 
 from nltk.metrics.agreement import AnnotationTask
diff --git a/nlp_resource_data/nltk/test/unit/test_freqdist.py b/nlp_resource_data/nltk/test/unit/test_freqdist.py
deleted file mode 100644 (file)
index a73fd02..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-import unittest
-import nltk
-
-
-class TestFreqDist(unittest.TestCase):
-
-    def test_iterating_returns_an_iterator_ordered_by_frequency(self):
-
-        samples = ['one', 'two', 'two']
-
-        distribution = nltk.FreqDist(samples)
-
-        most_frequent, less_frequent = [entry for entry in distribution]
-
-        self.assertEqual(most_frequent, 'two')
-        self.assertEqual(less_frequent, 'one')
index b9770ca..d211bc2 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
 from nltk.tag import hmm
 
 
index 6714d9c..ac61a65 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Twitter client
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Lorenzo Rubio <lrnzcig@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -14,8 +14,10 @@ package.
 
 import os
 import unittest
-from tempfile import TemporaryDirectory
 
+from six.moves import zip
+
+from nltk.compat import TemporaryDirectory
 from nltk.corpus import twitter_samples
 from nltk.twitter.common import json2csv, json2csv_entities
 
diff --git a/nlp_resource_data/nltk/test/unit/test_json_serialization.py b/nlp_resource_data/nltk/test/unit/test_json_serialization.py
deleted file mode 100644 (file)
index 4667fbf..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-import unittest
-
-from nltk.corpus import brown
-from nltk.jsontags import JSONTaggedDecoder, JSONTaggedEncoder
-from nltk.tag import DefaultTagger, RegexpTagger, AffixTagger
-from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, NgramTagger
-from nltk.tag import PerceptronTagger
-from nltk.tag import BrillTaggerTrainer, BrillTagger
-from nltk.tag.brill import nltkdemo18
-
-    
-class TestJSONSerialization(unittest.TestCase):
-    def setUp(self):
-        self.corpus = brown.tagged_sents()[:35]
-        self.decoder = JSONTaggedDecoder()
-        self.encoder = JSONTaggedEncoder()
-        self.default_tagger = DefaultTagger("NN")
-
-    def test_default_tagger(self):
-        encoded = self.encoder.encode(self.default_tagger)
-        decoded = self.decoder.decode(encoded)
-
-        self.assertEqual(repr(self.default_tagger), repr(decoded))
-        self.assertEqual(self.default_tagger._tag, decoded._tag)
-
-    def test_regexp_tagger(self):
-        tagger = RegexpTagger([(r".*", "NN")], backoff=self.default_tagger)
-
-        encoded = self.encoder.encode(tagger)
-        decoded = self.decoder.decode(encoded)
-
-        self.assertEqual(repr(tagger), repr(decoded))
-        self.assertEqual(repr(tagger.backoff), repr(decoded.backoff))
-        self.assertEqual(tagger._regexps, decoded._regexps)
-
-    def test_affix_tagger(self):
-        tagger = AffixTagger(self.corpus, backoff=self.default_tagger)
-
-        encoded = self.encoder.encode(tagger)
-        decoded = self.decoder.decode(encoded)
-
-        self.assertEqual(repr(tagger), repr(decoded))
-        self.assertEqual(repr(tagger.backoff), repr(decoded.backoff))
-        self.assertEqual(tagger._affix_length, decoded._affix_length)
-        self.assertEqual(tagger._min_word_length, decoded._min_word_length)
-        self.assertEqual(tagger._context_to_tag, decoded._context_to_tag)
-
-    def test_ngram_taggers(self):
-        unitagger = UnigramTagger(self.corpus, backoff=self.default_tagger)
-        bitagger = BigramTagger(self.corpus, backoff=unitagger)
-        tritagger = TrigramTagger(self.corpus, backoff=bitagger)
-        ntagger = NgramTagger(4, self.corpus, backoff=tritagger)
-
-        encoded = self.encoder.encode(ntagger)
-        decoded = self.decoder.decode(encoded)
-
-        self.assertEqual(repr(ntagger), repr(decoded))
-        self.assertEqual(repr(tritagger), repr(decoded.backoff))
-        self.assertEqual(repr(bitagger), repr(decoded.backoff.backoff))
-        self.assertEqual(repr(unitagger), repr(decoded.backoff.backoff.backoff))
-        self.assertEqual(repr(self.default_tagger), 
-                         repr(decoded.backoff.backoff.backoff.backoff))
-
-    def test_perceptron_tagger(self):
-        tagger = PerceptronTagger(load=False)
-        tagger.train(self.corpus)
-
-        encoded = self.encoder.encode(tagger)
-        decoded = self.decoder.decode(encoded)
-
-        self.assertEqual(tagger.model.weights, decoded.model.weights)
-        self.assertEqual(tagger.tagdict, decoded.tagdict)
-        self.assertEqual(tagger.classes, decoded.classes)
-
-    def test_brill_tagger(self):
-        trainer = BrillTaggerTrainer(self.default_tagger, nltkdemo18(),
-                                     deterministic=True)
-        tagger = trainer.train(self.corpus, max_rules=30)
-
-        encoded = self.encoder.encode(tagger)
-        decoded = self.decoder.decode(encoded)
-
-        self.assertEqual(repr(tagger._initial_tagger),
-                         repr(decoded._initial_tagger))
-        self.assertEqual(tagger._rules, decoded._rules)
-        self.assertEqual(tagger._training_stats, decoded._training_stats)
-
index ac9ff9b..37e4411 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import print_function, unicode_literals
 
 
 import unittest
diff --git a/nlp_resource_data/nltk/test/unit/test_nombank.py b/nlp_resource_data/nltk/test/unit/test_nombank.py
deleted file mode 100644 (file)
index 8f2d9d8..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Unit tests for nltk.corpus.nombank
-"""
-
-import unittest
-
-from nltk.corpus import nombank
-# Load the nombank once.
-nombank.nouns()
-
-class NombankDemo(unittest.TestCase):
-    def test_numbers(self):
-        # No. of instances.
-        self.assertEqual(len(nombank.instances()), 114574)
-        # No. of rolesets
-        self.assertEqual(len(nombank.rolesets()), 5577)
-        # No. of nouns.
-        self.assertEqual(len(nombank.nouns()), 4704)
-
-
-    def test_instance(self):
-        self.assertEqual(nombank.instances()[0].roleset, 'perc-sign.01')
-
-    def test_framefiles_fileids(self):
-        self.assertEqual(len(nombank.fileids()), 4705)
-        self.assertTrue(all(fileid.endswith('.xml') for fileid in nombank.fileids()))
diff --git a/nlp_resource_data/nltk/test/unit/test_pl196x.py b/nlp_resource_data/nltk/test/unit/test_pl196x.py
deleted file mode 100644 (file)
index d90d94c..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-import unittest
-
-import nltk
-from nltk.corpus.reader import pl196x
-
-
-class TestCorpusViews(unittest.TestCase):
-
-    def test_corpus_reader(self):
-        pl196x_dir = nltk.data.find('corpora/pl196x')
-        pl = pl196x.Pl196xCorpusReader(pl196x_dir, r'.*\.xml',
-                                       textids='textids.txt',
-                                       cat_file='cats.txt')
-        pl.tagged_words(fileids=pl.fileids(), categories='cats.txt')
index 0aced19..a0aa1d0 100644 (file)
@@ -3,6 +3,7 @@
 Tests for nltk.pos_tag
 """
 
+from __future__ import unicode_literals
 
 import unittest
 
index 3ba2d06..b26298c 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import print_function, unicode_literals
 
 import unittest
 
index c5d1583..a54c559 100644 (file)
@@ -3,6 +3,7 @@
 The following test performs a random series of reads, seeks, and
 tells, and checks that the results are consistent.
 """
+from __future__ import absolute_import, unicode_literals
 import random
 import functools
 from io import BytesIO
index be5fed0..8701225 100644 (file)
@@ -3,6 +3,7 @@
 Unit tests for Senna
 """
 
+from __future__ import unicode_literals
 from os import environ, path, sep
 
 import logging
index 52a0d66..67677df 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import print_function, unicode_literals
 
 import os
 import unittest
@@ -40,8 +41,11 @@ class SnowballTest(unittest.TestCase):
         assert ar_stemmer.stem("الكلمات") == "كلم"
 
     def test_russian(self):
+        # Russian words written in Cyrillic as well as in
+        # romanized (Latin) letters can be stemmed.
         stemmer_russian = SnowballStemmer("russian")
         assert stemmer_russian.stem("авантненькая") == "авантненьк"
+        assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k"
 
     def test_german(self):
         stemmer_german = SnowballStemmer("german")
index b460854..c382074 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
 
 
 def test_basic():
index f46b4ce..17b2c4a 100644 (file)
@@ -3,7 +3,7 @@
 #
 # Natural Language Toolkit: TGrep search
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Will Roberts <wildwilhelm@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 Unit tests for nltk.tgrep.
 '''
 
+from __future__ import absolute_import, print_function, unicode_literals
 
 import unittest
 
+from six import b
+
 from nltk.tree import ParentedTree
 from nltk import tgrep
 
@@ -63,7 +66,7 @@ class TestSequenceFunctions(unittest.TestCase):
         Test that tokenization handles bytes and strs the same way.
         '''
         self.assertEqual(
-            tgrep.tgrep_tokenize(b'A .. (B !< C . D) | ![<< (E , F) $ G]'),
+            tgrep.tgrep_tokenize(b('A .. (B !< C . D) | ![<< (E , F) $ G]')),
             tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]'),
         )
 
@@ -266,15 +269,15 @@ class TestSequenceFunctions(unittest.TestCase):
             '(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))'
         )
         self.assertEqual(
-            list(tgrep.tgrep_positions(b'NN', [tree])),
-            list(tgrep.tgrep_positions(b'NN', [tree])),
+            list(tgrep.tgrep_positions(b('NN'), [tree])),
+            list(tgrep.tgrep_positions('NN', [tree])),
         )
         self.assertEqual(
-            list(tgrep.tgrep_nodes(b'NN', [tree])),
+            list(tgrep.tgrep_nodes(b('NN'), [tree])),
             list(tgrep.tgrep_nodes('NN', [tree])),
         )
         self.assertEqual(
-            list(tgrep.tgrep_positions(b'NN|JJ', [tree])),
+            list(tgrep.tgrep_positions(b('NN|JJ'), [tree])),
             list(tgrep.tgrep_positions('NN|JJ', [tree])),
         )
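six.b() above turns a native string literal into a byte string on both interpreters (a no-op on Python 2, a latin-1 encode on Python 3), which is what lets the same tgrep pattern be fed in both as bytes and as text. A minimal sketch:

    from six import b

    pattern_bytes = b('NN|JJ')   # bytes on Python 3, str on Python 2
    pattern_text = 'NN|JJ'

    assert pattern_bytes.decode('ascii') == pattern_text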
 
index f3b80c5..fa0c286 100644 (file)
@@ -4,20 +4,15 @@ Unit tests for nltk.tokenize.
 See also nltk/test/tokenize.doctest
 """
 
+from __future__ import unicode_literals
 
+import os
 import unittest
 
 from nose import SkipTest
-from nose.tools import assert_equal
 
-from nltk.tokenize import (
-    punkt,
-    word_tokenize,
-    TweetTokenizer,
-    StanfordSegmenter,
-    TreebankWordTokenizer,
-    SyllableTokenizer,
-)
+from nltk.tokenize import word_tokenize
+from nltk.tokenize import TweetTokenizer, StanfordSegmenter, TreebankWordTokenizer
 
 
 class TestTokenize(unittest.TestCase):
@@ -42,14 +37,6 @@ class TestTokenize(unittest.TestCase):
             'français',
         ]
         self.assertEqual(tokens, expected)
-        
-    def test_sonority_sequencing_syllable_tokenizer(self):
-        """
-        Test SyllableTokenizer tokenizer.
-        """
-        tokenizer = SyllableTokenizer()
-        tokens = tokenizer.tokenize('justification')
-        self.assertEqual(tokens, ['jus', 'ti', 'fi', 'ca', 'tion'])
 
     def test_stanford_segmenter_arabic(self):
         """
@@ -108,25 +95,6 @@ class TestTokenize(unittest.TestCase):
         expected = ['(', '393', ')', "928 -3010"]
         result = tokenizer.tokenize(test2)
         self.assertEqual(result, expected)
-        
-    def test_pad_asterisk(self):
-        """
-        Test padding of asterisk for word tokenization.
-        """
-        text = "This is a, *weird sentence with *asterisks in it."
-        expected = ['This', 'is', 'a', ',', '*', 'weird', 'sentence', 
-                    'with', '*', 'asterisks', 'in', 'it', '.']
-        self.assertEqual(word_tokenize(text), expected)
-        
-    def test_pad_dotdot(self):
-        """
-        Test padding of dotdot* for word tokenization.
-        """
-        text = "Why did dotdot.. not get tokenized but dotdotdot... did? How about manydots....."
-        expected = ['Why', 'did', 'dotdot', '..', 'not', 'get', 
-                    'tokenized', 'but', 'dotdotdot', '...', 'did', '?', 
-                    'How', 'about', 'manydots', '.....']
-        self.assertEqual(word_tokenize(text), expected)
 
     def test_remove_handle(self):
         """
@@ -378,6 +346,7 @@ class TestTokenize(unittest.TestCase):
         result = list(tokenizer.span_tokenize(test3))
         self.assertEqual(result, expected)
 
+        
     def test_word_tokenize(self):
         """
         Test word_tokenize function
@@ -391,35 +360,3 @@ class TestTokenize(unittest.TestCase):
         sentence = "'v' 're'"
         expected = ["'", 'v', "'", "'re", "'"]
         self.assertEqual(word_tokenize(sentence), expected)
-
-    def test_punkt_pair_iter(self):
-
-        test_cases = [
-            ('12', [('1', '2'), ('2', None)]),
-            ('123', [('1', '2'), ('2', '3'), ('3', None)]),
-            ('1234', [('1', '2'), ('2', '3'), ('3', '4'), ('4', None)]),
-        ]
-
-        for (test_input, expected_output) in test_cases:
-            actual_output = [x for x in punkt._pair_iter(test_input)]
-
-            assert_equal(actual_output, expected_output)
-
-    def test_punkt_pair_iter_handles_stop_iteration_exception(self):
-        # test input to trigger StopIteration from next()
-        it = iter([])
-        # call method under test and produce a generator
-        gen = punkt._pair_iter(it)
-        # unpack generator, ensure that no error is raised
-        list(gen)
-
-    def test_punkt_tokenize_words_handles_stop_iteration_exception(self):
-        obj = punkt.PunktBaseClass()
-
-        class TestPunktTokenizeWordsMock:
-            def word_tokenize(self, s):
-                return iter([])
-
-        obj._lang_vars = TestPunktTokenizeWordsMock()
-        # unpack generator, ensure that no error is raised
-        list(obj._tokenize_words('test'))
index 08fd14a..f2191d3 100644 (file)
@@ -4,8 +4,8 @@ Unit tests for nltk.corpus.wordnet
 See also nltk/test/wordnet.doctest
 """
 
+from __future__ import unicode_literals
 
-import collections
 import os
 import unittest
 
@@ -195,26 +195,3 @@ class WordnNetDemo(unittest.TestCase):
         self.assertAlmostEqual(
             S('dog.n.01').lin_similarity(S('cat.n.01'), semcor_ic), 0.8863, places=3
         )
-
-    def test_omw_lemma_no_trailing_underscore(self):
-        expected = sorted([
-            u'popolna_sprememba_v_mišljenju',
-            u'popoln_obrat',
-            u'preobrat',
-            u'preobrat_v_mišljenju'
-            ])
-        self.assertEqual(sorted(S('about-face.n.02').lemma_names(lang='slv')), expected)
-
-    def test_iterable_type_for_all_lemma_names(self):
-        # Duck-test for iterables.
-        # See https://stackoverflow.com/a/36230057/610569
-        cat_lemmas = wn.all_lemma_names(lang='cat')
-        eng_lemmas = wn.all_lemma_names(lang='eng')
-
-        self.assertTrue(hasattr(eng_lemmas, '__iter__'))
-        self.assertTrue(hasattr(eng_lemmas, '__next__') or hasattr(eng_lemmas, 'next'))
-        self.assertTrue(eng_lemmas.__iter__() is eng_lemmas)
-
-        self.assertTrue(hasattr(cat_lemmas, '__iter__'))
-        self.assertTrue(hasattr(cat_lemmas, '__next__') or hasattr(eng_lemmas, 'next'))
-        self.assertTrue(cat_lemmas.__iter__() is cat_lemmas)
index 8a4115d..48b64c7 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/translate/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/translate/__pycache__/__init__.cpython-37.pyc differ
index f84b75a..2dce884 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_bleu.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_bleu.cpython-37.pyc differ
index e9f61e1..c973f8a 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_gdfa.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_gdfa.cpython-37.pyc differ
index 77558d2..dc66b4d 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm1.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm1.cpython-37.pyc differ
index 13b6a54..02014af 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm2.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm2.cpython-37.pyc differ
index b3c6f66..316900a 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm3.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm3.cpython-37.pyc differ
index 7a975f3..4288e58 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm4.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm4.cpython-37.pyc differ
index a61b2c2..ed639a8 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm5.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm5.cpython-37.pyc differ
index 2a75f20..fba69a8 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm_model.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_ibm_model.cpython-37.pyc differ
index ee088b1..5c7aacc 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_nist.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_nist.cpython-37.pyc differ
index 9c024dd..10c1bcc 100644 (file)
Binary files a/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_stack_decoder.cpython-37.pyc and b/nlp_resource_data/nltk/test/unit/translate/__pycache__/test_stack_decoder.cpython-37.pyc differ
index fd286f6..5d5f2d4 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Stack decoder
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Tah Wei Hoon <hoon.tw@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
index 8bd7346..0489b16 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 from unittest import TestCase
 from functools import wraps
 from nose.plugins.skip import SkipTest
index f2360ff..7ba6af1 100644 (file)
@@ -1,10 +1,11 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 =================
 Utility functions
 =================
 
+    >>> from __future__ import print_function
     >>> from nltk.util import *
     >>> from nltk.tree import Tree
 
index 54c5975..409504d 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 =================
@@ -6,6 +6,7 @@ WordNet Interface
 =================
 
 WordNet is just another NLTK corpus reader, and can be imported like this:
+    >>> from __future__ import print_function, unicode_literals
     >>> from nltk.corpus import wordnet
 
 For more compact code, we recommend:
@@ -52,31 +53,31 @@ WordNet, using ISO-639 language codes.
     'nob', 'pol', 'por', 'qcn', 'slv', 'spa', 'swe', 'tha', 'zsm']
     >>> wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn')
     [Synset('dog.n.01'), Synset('spy.n.01')]
-
+    
     wn.synset('spy.n.01').lemma_names('jpn') # doctest: +NORMALIZE_WHITESPACE
     ['\u3044\u306c', '\u307e\u308f\u3057\u8005', '\u30b9\u30d1\u30a4', '\u56de\u3057\u8005',
     '\u56de\u8005', '\u5bc6\u5075', '\u5de5\u4f5c\u54e1', '\u5efb\u3057\u8005',
     '\u5efb\u8005', '\u63a2', '\u63a2\u308a', '\u72ac', '\u79d8\u5bc6\u635c\u67fb\u54e1',
     '\u8adc\u5831\u54e1', '\u8adc\u8005', '\u9593\u8005', '\u9593\u8adc', '\u96a0\u5bc6']
-
+    
     >>> wn.synset('dog.n.01').lemma_names('ita')
     ['cane', 'Canis_familiaris']
     >>> wn.lemmas('cane', lang='ita') # doctest: +NORMALIZE_WHITESPACE
-    [Lemma('dog.n.01.cane'), Lemma('cramp.n.02.cane'), Lemma('hammer.n.01.cane'), Lemma('bad_person.n.01.cane'),
+    [Lemma('dog.n.01.cane'), Lemma('cramp.n.02.cane'), Lemma('hammer.n.01.cane'), Lemma('bad_person.n.01.cane'), 
     Lemma('incompetent.n.01.cane')]
     >>> sorted(wn.synset('dog.n.01').lemmas('dan')) # doctest: +NORMALIZE_WHITESPACE
     [Lemma('dog.n.01.hund'), Lemma('dog.n.01.k\xf8ter'),
     Lemma('dog.n.01.vovhund'), Lemma('dog.n.01.vovse')]
-
+    
     sorted(wn.synset('dog.n.01').lemmas('por'))
        [Lemma('dog.n.01.cachorra'), Lemma('dog.n.01.cachorro'), Lemma('dog.n.01.cadela'), Lemma('dog.n.01.c\xe3o')]
-
+    
     >>> dog_lemma = wn.lemma(b'dog.n.01.c\xc3\xa3o'.decode('utf-8'), lang='por')
     >>> dog_lemma
     Lemma('dog.n.01.c\xe3o')
     >>> dog_lemma.lang()
     'por'
-    >>> len(list(wordnet.all_lemma_names(pos='n', lang='jpn')))
+    >>> len(wordnet.all_lemma_names(pos='n', lang='jpn'))
     64797
 
 -------
@@ -430,7 +431,7 @@ Compute transitive closures of synsets
      Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'),
      Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), ...]
     >>> list(dog.closure(hyper)) # doctest: +NORMALIZE_WHITESPACE
-    [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'), Synset('animal.n.01'),
+    [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'), Synset('animal.n.01'), 
     Synset('placental.n.01'), Synset('organism.n.01'), Synset('mammal.n.01'), Synset('living_thing.n.01'),
     Synset('vertebrate.n.01'), Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'),
     Synset('physical_entity.n.01'), Synset('entity.n.01')]
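The doctest edit above (len(...) without list(...)) reflects that the 3.4-era wordnet reader returns a sized container from all_lemma_names, while later releases return a generator that has no len(). A small sketch that counts lemmas either way (assumes the wordnet and omw corpora are installed):

    from nltk.corpus import wordnet

    lemmas = wordnet.all_lemma_names(pos='n', lang='jpn')
    try:
        count = len(lemmas)               # nltk 3.4: a container with a length
    except TypeError:
        count = sum(1 for _ in lemmas)    # later nltk: a generator, count by iterating
    print(count)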
index 09ba27c..1412c0d 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 
 
 def teardown_module(module=None):
index d92b5a1..c2536b4 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 ===============================
index 28cf0e9..b4d8f90 100644 (file)
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2020 NLTK Project
+.. Copyright (C) 2001-2019 NLTK Project
 .. For license information, see LICENSE.TXT
 
 .. -*- coding: utf-8 -*-
index fc2731f..0fa9c3c 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Texts
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 #         Edward Loper <edloper@gmail.com>
 # URL: <http://nltk.org/>
@@ -13,25 +13,25 @@ Functionality includes: concordancing, collocation discovery,
 regular expression search over tokenized strings, and
 distributional similarity.
 """
+from __future__ import print_function, division, unicode_literals, absolute_import
 
 from math import log
 from collections import defaultdict, Counter, namedtuple
 from functools import reduce
 import re
-import sys
 
-from nltk.lm import MLE
-from nltk.lm.preprocessing import padded_everygram_pipeline
+from six import text_type
+
 from nltk.probability import FreqDist
 from nltk.probability import ConditionalFreqDist as CFD
 from nltk.util import tokenwrap, LazyConcatenation
 from nltk.metrics import f_measure, BigramAssocMeasures
 from nltk.collocations import BigramCollocationFinder
-from nltk.tokenize import sent_tokenize
+from nltk.compat import python_2_unicode_compatible
 
 ConcordanceLine = namedtuple(
-    "ConcordanceLine",
-    ["left", "query", "right", "offset", "left_print", "right_print", "line"],
+    'ConcordanceLine',
+    ['left', 'query', 'right', 'offset', 'left_print', 'right_print', 'line'],
 )
 
 
@@ -46,8 +46,8 @@ class ContextIndex(object):
     @staticmethod
     def _default_context(tokens, i):
         """One left token and one right token, normalized to lowercase"""
-        left = tokens[i - 1].lower() if i != 0 else "*START*"
-        right = tokens[i + 1].lower() if i != len(tokens) - 1 else "*END*"
+        left = tokens[i - 1].lower() if i != 0 else '*START*'
+        right = tokens[i + 1].lower() if i != len(tokens) - 1 else '*END*'
         return (left, right)
 
     def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
@@ -126,7 +126,7 @@ class ContextIndex(object):
             return fd
 
 
-
+@python_2_unicode_compatible
 class ConcordanceIndex(object):
     """
     An index that can be used to look up the offset locations at which
@@ -178,7 +178,7 @@ class ConcordanceIndex(object):
         return self._offsets[word]
 
     def __repr__(self):
-        return "<ConcordanceIndex for %d tokens (%d types)>" % (
+        return '<ConcordanceIndex for %d tokens (%d types)>' % (
             len(self._tokens),
             len(self._offsets),
         )
@@ -200,10 +200,10 @@ class ConcordanceIndex(object):
                 left_context = self._tokens[max(0, i - context) : i]
                 right_context = self._tokens[i + 1 : i + context]
                 # Create the pretty lines with the query_word in the middle.
-                left_print = " ".join(left_context)[-half_width:]
-                right_print = " ".join(right_context)[:half_width]
+                left_print = ' '.join(left_context)[-half_width:]
+                right_print = ' '.join(right_context)[:half_width]
                 # The WYSIWYG line of the concordance.
-                line_print = " ".join([left_print, query_word, right_print])
+                line_print = ' '.join([left_print, query_word, right_print])
                 # Create the ConcordanceLine
                 concordance_line = ConcordanceLine(
                     left_context,
@@ -252,7 +252,7 @@ class TokenSearcher(object):
     """
 
     def __init__(self, tokens):
-        self._raw = "".join("<" + w + ">" for w in tokens)
+        self._raw = ''.join('<' + w + '>' for w in tokens)
 
     def findall(self, regexp):
         """
@@ -279,25 +279,25 @@ class TokenSearcher(object):
         :type regexp: str
         """
         # preprocess the regular expression
-        regexp = re.sub(r"\s", "", regexp)
-        regexp = re.sub(r"<", "(?:<(?:", regexp)
-        regexp = re.sub(r">", ")>)", regexp)
-        regexp = re.sub(r"(?<!\\)\.", "[^>]", regexp)
+        regexp = re.sub(r'\s', '', regexp)
+        regexp = re.sub(r'<', '(?:<(?:', regexp)
+        regexp = re.sub(r'>', ')>)', regexp)
+        regexp = re.sub(r'(?<!\\)\.', '[^>]', regexp)
 
         # perform the search
         hits = re.findall(regexp, self._raw)
 
         # Sanity check
         for h in hits:
-            if not h.startswith("<") and h.endswith(">"):
-                raise ValueError("Bad regexp for TokenSearcher.findall")
+            if not h.startswith('<') and h.endswith('>'):
+                raise ValueError('Bad regexp for TokenSearcher.findall')
 
         # postprocess the output
-        hits = [h[1:-1].split("><") for h in hits]
+        hits = [h[1:-1].split('><') for h in hits]
         return hits
 
 
-
+@python_2_unicode_compatible
 class Text(object):
     """
     A wrapper around a sequence of simple (string) tokens, which is
@@ -337,11 +337,11 @@ class Text(object):
 
         if name:
             self.name = name
-        elif "]" in tokens[:20]:
-            end = tokens[:20].index("]")
-            self.name = " ".join(str(tok) for tok in tokens[1:end])
+        elif ']' in tokens[:20]:
+            end = tokens[:20].index(']')
+            self.name = " ".join(text_type(tok) for tok in tokens[1:end])
         else:
-            self.name = " ".join(str(tok) for tok in tokens[:8]) + "..."
+            self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."
 
     # ////////////////////////////////////////////////////////////
     # Support item & slice access
@@ -371,7 +371,7 @@ class Text(object):
 
         :seealso: ``ConcordanceIndex``
         """
-        if "_concordance_index" not in self.__dict__:
+        if '_concordance_index' not in self.__dict__:
             self._concordance_index = ConcordanceIndex(
                 self.tokens, key=lambda s: s.lower()
             )
@@ -392,28 +392,24 @@ class Text(object):
 
         :seealso: ``ConcordanceIndex``
         """
-        if "_concordance_index" not in self.__dict__:
+        if '_concordance_index' not in self.__dict__:
             self._concordance_index = ConcordanceIndex(
                 self.tokens, key=lambda s: s.lower()
             )
         return self._concordance_index.find_concordance(word, width)[:lines]
 
-    def collocation_list(self, num=20, window_size=2):
+    def collocations(self, num=20, window_size=2):
         """
-        Return collocations derived from the text, ignoring stopwords.
-        
-            >>> from nltk.book import text4
-            >>> text4.collocation_list()[:2]
-            [('United', 'States'), ('fellow', 'citizens')]
+        Print collocations derived from the text, ignoring stopwords.
 
-        :param num: The maximum number of collocations to return.
+        :seealso: find_collocations
+        :param num: The maximum number of collocations to print.
         :type num: int
         :param window_size: The number of tokens spanned by a collocation (default=2)
         :type window_size: int
-        :rtype: list(tuple(str, str))
         """
         if not (
-            "_collocations" in self.__dict__
+            '_collocations' in self.__dict__
             and self._num == num
             and self._window_size == window_size
         ):
@@ -423,32 +419,14 @@ class Text(object):
             # print("Building collocations list")
             from nltk.corpus import stopwords
 
-            ignored_words = stopwords.words("english")
+            ignored_words = stopwords.words('english')
             finder = BigramCollocationFinder.from_words(self.tokens, window_size)
             finder.apply_freq_filter(2)
             finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
             bigram_measures = BigramAssocMeasures()
-            self._collocations = list(finder.nbest(bigram_measures.likelihood_ratio, num))
-        return self._collocations
-
-    def collocations(self, num=20, window_size=2):
-        """
-        Print collocations derived from the text, ignoring stopwords.
-        
-            >>> from nltk.book import text4
-            >>> text4.collocations() # doctest: +ELLIPSIS
-            United States; fellow citizens; four years; ...
-
-        :param num: The maximum number of collocations to print.
-        :type num: int
-        :param window_size: The number of tokens spanned by a collocation (default=2)
-        :type window_size: int
-        """
-
-        collocation_strings = [
-            w1 + " " + w2 for w1, w2 in self.collocation_list(num, window_size)
-        ]
-        print(tokenwrap(collocation_strings, separator="; "))
+            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
+        colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
+        print(tokenwrap(colloc_strings, separator="; "))
 
     def count(self, word):
         """
@@ -477,7 +455,7 @@ class Text(object):
         :type num: int
         :seealso: ContextIndex.similar_words()
         """
-        if "_word_context_index" not in self.__dict__:
+        if '_word_context_index' not in self.__dict__:
             # print('Building word-context index...')
             self._word_context_index = ContextIndex(
                 self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower()
@@ -505,13 +483,13 @@ class Text(object):
         Find contexts where the specified words appear; list
         most frequent common contexts first.
 
-        :param words: The words used to seed the similarity search
-        :type words: str
+        :param word: The word used to seed the similarity search
+        :type word: str
         :param num: The number of words to generate (default=20)
         :type num: int
         :seealso: ContextIndex.common_contexts()
         """
-        if "_word_context_index" not in self.__dict__:
+        if '_word_context_index' not in self.__dict__:
             # print('Building word-context index...')
             self._word_context_index = ContextIndex(
                 self.tokens, key=lambda s: s.lower()
@@ -541,58 +519,15 @@ class Text(object):
 
         dispersion_plot(self, words)
 
-    def _train_default_ngram_lm(self, tokenized_sents, n=3):
-        train_data, padded_sents = padded_everygram_pipeline(n, tokenized_sents)
-        model = MLE(order=n)
-        model.fit(train_data, padded_sents)
-        return model
-
-    def generate(self, length=100, text_seed=None, random_seed=42):
+    def generate(self, words):
         """
-        Print random text, generated using a trigram language model.
-        See also `help(nltk.lm)`.
-
-        :param length: The length of text to generate (default=100)
-        :type length: int
-
-        :param text_seed: Generation can be conditioned on preceding context.
-        :type text_seed: list(str)
-
-        :param random_seed: A random seed or an instance of `random.Random`. If provided,
-        makes the random sampling part of generation reproducible. (default=42)
-        :type random_seed: int
-
+        Issues a reminder to users following the book online
         """
-        # Create the model when using it the first time.
-        self._tokenized_sents = [
-            sent.split(" ") for sent in sent_tokenize(" ".join(self.tokens))
-        ]
-        if not hasattr(self, "trigram_model"):
-            print("Building ngram index...", file=sys.stderr)
-            self._trigram_model = self._train_default_ngram_lm(
-                self._tokenized_sents, n=3
-            )
+        import warnings
 
-        generated_tokens = []
-
-        assert length > 0, "The `length` must be more than 0."
-        while len(generated_tokens) < length:
-            for idx, token in enumerate(
-                self._trigram_model.generate(
-                    length, text_seed=text_seed, random_seed=random_seed
-                )
-            ):
-                if token == "<s>":
-                    continue
-                if token == "</s>":
-                    break
-                generated_tokens.append(token)
-            random_seed += 1
-
-        prefix = " ".join(text_seed) + " " if text_seed else ""
-        output_str = prefix + tokenwrap(generated_tokens[:length])
-        print(output_str)
-        return output_str
+        warnings.warn(
+            'The generate() method is no longer available.', DeprecationWarning
+        )
 
     def plot(self, *args):
         """
@@ -638,14 +573,14 @@ class Text(object):
             self._token_searcher = TokenSearcher(self)
 
         hits = self._token_searcher.findall(regexp)
-        hits = [" ".join(h) for h in hits]
+        hits = [' '.join(h) for h in hits]
         print(tokenwrap(hits, "; "))
 
     # ////////////////////////////////////////////////////////////
     # Helper Methods
     # ////////////////////////////////////////////////////////////
 
-    _CONTEXT_RE = re.compile("\w+|[\.\!\?]")
+    _CONTEXT_RE = re.compile('\w+|[\.\!\?]')
 
     def _context(self, tokens, i):
         """
@@ -657,13 +592,13 @@ class Text(object):
         j = i - 1
         while j >= 0 and not self._CONTEXT_RE.match(tokens[j]):
             j -= 1
-        left = tokens[j] if j != 0 else "*START*"
+        left = tokens[j] if j != 0 else '*START*'
 
         # Right context
         j = i + 1
         while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
             j += 1
-        right = tokens[j] if j != len(tokens) else "*END*"
+        right = tokens[j] if j != len(tokens) else '*END*'
 
         return (left, right)
 
@@ -672,10 +607,10 @@ class Text(object):
     # ////////////////////////////////////////////////////////////
 
     def __str__(self):
-        return "<Text: %s>" % self.name
+        return '<Text: %s>' % self.name
 
     def __repr__(self):
-        return "<Text: %s>" % self.name
+        return '<Text: %s>' % self.name
 
 
 # Prototype only; this approach will be slow to load
@@ -697,7 +632,7 @@ class TextCollection(Text):
     """
 
     def __init__(self, source):
-        if hasattr(source, "words"):  # bridge to the text corpus reader
+        if hasattr(source, 'words'):  # bridge to the text corpus reader
             source = [source.words(f) for f in source.fileids()]
 
         self._texts = source
@@ -717,7 +652,7 @@ class TextCollection(Text):
         if idf is None:
             matches = len([True for text in self._texts if term in text])
             if len(self._texts) == 0:
-                raise ValueError("IDF undefined for empty document collection")
+                raise ValueError('IDF undefined for empty document collection')
             idf = log(len(self._texts) / matches) if matches else 0.0
             self._idf_cache[term] = idf
         return idf
@@ -729,14 +664,14 @@ class TextCollection(Text):
 def demo():
     from nltk.corpus import brown
 
-    text = Text(brown.words(categories="news"))
+    text = Text(brown.words(categories='news'))
     print(text)
     print()
     print("Concordance:")
-    text.concordance("news")
+    text.concordance('news')
     print()
     print("Distributionally similar words:")
-    text.similar("news")
+    text.similar('news')
     print()
     print("Collocations:")
     text.collocations()
@@ -745,7 +680,7 @@ def demo():
     # text.generate()
     # print()
     print("Dispersion plot:")
-    text.dispersion_plot(["news", "report", "said", "announced"])
+    text.dispersion_plot(['news', 'report', 'said', 'announced'])
     print()
     print("Vocabulary plot:")
     text.plot(50)
@@ -753,10 +688,10 @@ def demo():
     print("Indexing:")
     print("text[3]:", text[3])
     print("text[3:5]:", text[3:5])
-    print("text.vocab()['news']:", text.vocab()["news"])
+    print("text.vocab()['news']:", text.vocab()['news'])
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
 
 __all__ = [
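The text.py hunks above restore the python_2_unicode_compatible decorator on ConcordanceIndex and Text. Under Python 2 the decorator takes a class whose __str__ returns text, renames that method to __unicode__, and installs a bytes-returning __str__, so both str(obj) and unicode(obj) work; under Python 3 it leaves the class untouched. A minimal sketch of how such a class is written (the Label class is illustrative, not from nltk):

    from nltk.compat import python_2_unicode_compatible

    @python_2_unicode_compatible
    class Label(object):
        def __init__(self, name):
            self.name = name

        def __str__(self):
            # Written once in Python 3 style; the decorator supplies
            # __unicode__ and a bytes __str__ when running on Python 2.
            return u"<Label: %s>" % self.name

    print(Label(u"dog"))   # <Label: dog> on both interpreters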
index 84df549..d5a315a 100644 (file)
@@ -3,12 +3,12 @@
 #
 # Natural Language Toolkit: TGrep search
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Will Roberts <wildwilhelm@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
-"""
+'''
 ============================================
  TGrep search implementation for NLTK trees
 ============================================
@@ -108,32 +108,36 @@ specified in a call to a predicate.  Predicates which call other
 predicates must always pass the value of these arguments on.  The
 top-level predicate (constructed by ``_tgrep_exprs_action``) binds the
 macro definitions to ``m`` and initialises ``l`` to an empty dictionary.
-"""
+'''
+
+from __future__ import absolute_import, print_function, unicode_literals
 
 import functools
 import re
 
+from six import binary_type, text_type
+
 try:
     import pyparsing
 except ImportError:
-    print("Warning: nltk.tgrep will not work without the `pyparsing` package")
-    print("installed.")
+    print('Warning: nltk.tgrep will not work without the `pyparsing` package')
+    print('installed.')
 
 import nltk.tree
 
 
 class TgrepException(Exception):
-    """Tgrep exception type."""
+    '''Tgrep exception type.'''
 
     pass
 
 
 def ancestors(node):
-    """
+    '''
     Returns the list of all nodes dominating the given tree node.
     This method will not work with leaf nodes, since there is no way
     to recover the parent.
-    """
+    '''
     results = []
     try:
         current = node.parent()
@@ -147,10 +151,10 @@ def ancestors(node):
 
 
 def unique_ancestors(node):
-    """
+    '''
     Returns the list of all nodes dominating the given node, where
     there is only a single path of descent.
-    """
+    '''
     results = []
     try:
         current = node.parent()
@@ -164,10 +168,10 @@ def unique_ancestors(node):
 
 
 def _descendants(node):
-    """
+    '''
     Returns the list of all nodes which are descended from the given
     tree node in some way.
-    """
+    '''
     try:
         treepos = node.treepositions()
     except AttributeError:
@@ -176,10 +180,10 @@ def _descendants(node):
 
 
 def _leftmost_descendants(node):
-    """
+    '''
     Returns the set of all nodes descended in some way through
     left branches from this node.
-    """
+    '''
     try:
         treepos = node.treepositions()
     except AttributeError:
@@ -188,10 +192,10 @@ def _leftmost_descendants(node):
 
 
 def _rightmost_descendants(node):
-    """
+    '''
     Returns the set of all nodes descended in some way through
     right branches from this node.
-    """
+    '''
     try:
         rightmost_leaf = max(node.treepositions())
     except AttributeError:
@@ -200,15 +204,15 @@ def _rightmost_descendants(node):
 
 
 def _istree(obj):
-    """Predicate to check whether `obj` is a nltk.tree.Tree."""
+    '''Predicate to check whether `obj` is a nltk.tree.Tree.'''
     return isinstance(obj, nltk.tree.Tree)
 
 
 def _unique_descendants(node):
-    """
+    '''
     Returns the list of all nodes descended from the given node, where
     there is only a single path of descent.
-    """
+    '''
     results = []
     current = node
     while current and _istree(current) and len(current) == 1:
@@ -218,9 +222,9 @@ def _unique_descendants(node):
 
 
 def _before(node):
-    """
+    '''
     Returns the set of all nodes that are before the given node.
-    """
+    '''
     try:
         pos = node.treeposition()
         tree = node.root()
@@ -230,14 +234,14 @@ def _before(node):
 
 
 def _immediately_before(node):
-    """
+    '''
     Returns the set of all nodes that are immediately before the given
     node.
 
     Tree node A immediately precedes node B if the last terminal
     symbol (word) produced by A immediately precedes the first
     terminal symbol produced by B.
-    """
+    '''
     try:
         pos = node.treeposition()
         tree = node.root()
@@ -256,9 +260,9 @@ def _immediately_before(node):
 
 
 def _after(node):
-    """
+    '''
     Returns the set of all nodes that are after the given node.
-    """
+    '''
     try:
         pos = node.treeposition()
         tree = node.root()
@@ -268,14 +272,14 @@ def _after(node):
 
 
 def _immediately_after(node):
-    """
+    '''
     Returns the set of all nodes that are immediately after the given
     node.
 
     Tree node A immediately follows node B if the first terminal
     symbol (word) produced by A immediately follows the last
     terminal symbol produced by B.
-    """
+    '''
     try:
         pos = node.treeposition()
         tree = node.root()
@@ -297,66 +301,67 @@ def _immediately_after(node):
 
 
 def _tgrep_node_literal_value(node):
-    """
+    '''
     Gets the string value of a given parse tree node, for comparison
     using the tgrep node literal predicates.
-    """
-    return node.label() if _istree(node) else str(node)
+    '''
+    return node.label() if _istree(node) else text_type(node)
 
 
 def _tgrep_macro_use_action(_s, _l, tokens):
-    """
+    '''
     Builds a lambda function which looks up the macro name used.
-    """
+    '''
     assert len(tokens) == 1
-    assert tokens[0][0] == "@"
+    assert tokens[0][0] == '@'
     macro_name = tokens[0][1:]
 
     def macro_use(n, m=None, l=None):
         if m is None or macro_name not in m:
-            raise TgrepException("macro {0} not defined".format(macro_name))
+            raise TgrepException('macro {0} not defined'.format(macro_name))
         return m[macro_name](n, m, l)
 
     return macro_use
 
 
 def _tgrep_node_action(_s, _l, tokens):
-    """
+    '''
     Builds a lambda function representing a predicate on a tree node
     depending on the name of its node.
-    """
+    '''
+    # print 'node tokens: ', tokens
     if tokens[0] == "'":
         # strip initial apostrophe (tgrep2 print command)
         tokens = tokens[1:]
     if len(tokens) > 1:
         # disjunctive definition of a node name
-        assert list(set(tokens[1::2])) == ["|"]
+        assert list(set(tokens[1::2])) == ['|']
         # recursively call self to interpret each node name definition
         tokens = [_tgrep_node_action(None, None, [node]) for node in tokens[::2]]
         # capture tokens and return the disjunction
         return (lambda t: lambda n, m=None, l=None: any(f(n, m, l) for f in t))(tokens)
     else:
-        if hasattr(tokens[0], "__call__"):
+        if hasattr(tokens[0], '__call__'):
             # this is a previously interpreted parenthetical node
             # definition (lambda function)
             return tokens[0]
-        elif tokens[0] == "*" or tokens[0] == "__":
+        elif tokens[0] == '*' or tokens[0] == '__':
             return lambda n, m=None, l=None: True
         elif tokens[0].startswith('"'):
             assert tokens[0].endswith('"')
-            node_lit = tokens[0][1:-1].replace('\\"', '"').replace("\\\\", "\\")
+            node_lit = tokens[0][1:-1].replace('\\"', '"').replace('\\\\', '\\')
             return (
                 lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s
             )(node_lit)
-        elif tokens[0].startswith("/"):
-            assert tokens[0].endswith("/")
+        elif tokens[0].startswith('/'):
+            assert tokens[0].endswith('/')
             node_lit = tokens[0][1:-1]
             return (
                 lambda r: lambda n, m=None, l=None: r.search(
                     _tgrep_node_literal_value(n)
                 )
             )(re.compile(node_lit))
-        elif tokens[0].startswith("i@"):
+        elif tokens[0].startswith('i@'):
             node_func = _tgrep_node_action(_s, _l, [tokens[0][2:].lower()])
             return (
                 lambda f: lambda n, m=None, l=None: f(
@@ -370,78 +375,80 @@ def _tgrep_node_action(_s, _l, tokens):
 
 
 def _tgrep_parens_action(_s, _l, tokens):
-    """
+    '''
     Builds a lambda function representing a predicate on a tree node
     from a parenthetical notation.
-    """
+    '''
+    # print 'parenthetical tokens: ', tokens
     assert len(tokens) == 3
-    assert tokens[0] == "("
-    assert tokens[2] == ")"
+    assert tokens[0] == '('
+    assert tokens[2] == ')'
     return tokens[1]
 
 
 def _tgrep_nltk_tree_pos_action(_s, _l, tokens):
-    """
+    '''
     Builds a lambda function representing a predicate on a tree node
     which returns true if the node is located at a specific tree
     position.
-    """
+    '''
     # recover the tuple from the parsed string
     node_tree_position = tuple(int(x) for x in tokens if x.isdigit())
     # capture the node's tree position
     return (
         lambda i: lambda n, m=None, l=None: (
-            hasattr(n, "treeposition") and n.treeposition() == i
+            hasattr(n, 'treeposition') and n.treeposition() == i
         )
     )(node_tree_position)
 
 
 def _tgrep_relation_action(_s, _l, tokens):
-    """
+    '''
     Builds a lambda function representing a predicate on a tree node
     depending on its relation to other nodes in the tree.
-    """
+    '''
+    # print 'relation tokens: ', tokens
     # process negation first if needed
     negated = False
-    if tokens[0] == "!":
+    if tokens[0] == '!':
         negated = True
         tokens = tokens[1:]
-    if tokens[0] == "[":
+    if tokens[0] == '[':
         # process square-bracketed relation expressions
         assert len(tokens) == 3
-        assert tokens[2] == "]"
+        assert tokens[2] == ']'
         retval = tokens[1]
     else:
         # process operator-node relation expressions
         assert len(tokens) == 2
         operator, predicate = tokens
         # A < B       A is the parent of (immediately dominates) B.
-        if operator == "<":
+        if operator == '<':
             retval = lambda n, m=None, l=None: (
                 _istree(n) and any(predicate(x, m, l) for x in n)
             )
         # A > B       A is the child of B.
-        elif operator == ">":
+        elif operator == '>':
             retval = lambda n, m=None, l=None: (
-                hasattr(n, "parent")
+                hasattr(n, 'parent')
                 and bool(n.parent())
                 and predicate(n.parent(), m, l)
             )
         # A <, B      Synonymous with A <1 B.
-        elif operator == "<," or operator == "<1":
+        elif operator == '<,' or operator == '<1':
             retval = lambda n, m=None, l=None: (
                 _istree(n) and bool(list(n)) and predicate(n[0], m, l)
             )
         # A >, B      Synonymous with A >1 B.
-        elif operator == ">," or operator == ">1":
+        elif operator == '>,' or operator == '>1':
             retval = lambda n, m=None, l=None: (
-                hasattr(n, "parent")
+                hasattr(n, 'parent')
                 and bool(n.parent())
                 and (n is n.parent()[0])
                 and predicate(n.parent(), m, l)
             )
         # A <N B      B is the Nth child of A (the first child is <1).
-        elif operator[0] == "<" and operator[1:].isdigit():
+        elif operator[0] == '<' and operator[1:].isdigit():
             idx = int(operator[1:])
             # capture the index parameter
             retval = (
@@ -453,12 +460,12 @@ def _tgrep_relation_action(_s, _l, tokens):
                 )
             )(idx - 1)
         # A >N B      A is the Nth child of B (the first child is >1).
-        elif operator[0] == ">" and operator[1:].isdigit():
+        elif operator[0] == '>' and operator[1:].isdigit():
             idx = int(operator[1:])
             # capture the index parameter
             retval = (
                 lambda i: lambda n, m=None, l=None: (
-                    hasattr(n, "parent")
+                    hasattr(n, 'parent')
                     and bool(n.parent())
                     and 0 <= i < len(n.parent())
                     and (n is n.parent()[i])
@@ -467,21 +474,21 @@ def _tgrep_relation_action(_s, _l, tokens):
             )(idx - 1)
         # A <' B      B is the last child of A (also synonymous with A <-1 B).
         # A <- B      B is the last child of A (synonymous with A <-1 B).
-        elif operator == "<'" or operator == "<-" or operator == "<-1":
+        elif operator == '<\'' or operator == '<-' or operator == '<-1':
             retval = lambda n, m=None, l=None: (
                 _istree(n) and bool(list(n)) and predicate(n[-1], m, l)
             )
         # A >' B      A is the last child of B (also synonymous with A >-1 B).
         # A >- B      A is the last child of B (synonymous with A >-1 B).
-        elif operator == ">'" or operator == ">-" or operator == ">-1":
+        elif operator == '>\'' or operator == '>-' or operator == '>-1':
             retval = lambda n, m=None, l=None: (
-                hasattr(n, "parent")
+                hasattr(n, 'parent')
                 and bool(n.parent())
                 and (n is n.parent()[-1])
                 and predicate(n.parent(), m, l)
             )
         # A <-N B        B is the N th-to-last child of A (the last child is <-1).
-        elif operator[:2] == "<-" and operator[2:].isdigit():
+        elif operator[:2] == '<-' and operator[2:].isdigit():
             idx = -int(operator[2:])
             # capture the index parameter
             retval = (
@@ -493,12 +500,12 @@ def _tgrep_relation_action(_s, _l, tokens):
                 )
             )(idx)
         # A >-N B        A is the N th-to-last child of B (the last child is >-1).
-        elif operator[:2] == ">-" and operator[2:].isdigit():
+        elif operator[:2] == '>-' and operator[2:].isdigit():
             idx = -int(operator[2:])
             # capture the index parameter
             retval = (
                 lambda i: lambda n, m=None, l=None: (
-                    hasattr(n, "parent")
+                    hasattr(n, 'parent')
                     and bool(n.parent())
                     and 0 <= (i + len(n.parent())) < len(n.parent())
                     and (n is n.parent()[i + len(n.parent())])
@@ -506,115 +513,115 @@ def _tgrep_relation_action(_s, _l, tokens):
                 )
             )(idx)
         # A <: B      B is the only child of A
-        elif operator == "<:":
+        elif operator == '<:':
             retval = lambda n, m=None, l=None: (
                 _istree(n) and len(n) == 1 and predicate(n[0], m, l)
             )
         # A >: B      A is the only child of B.
-        elif operator == ">:":
+        elif operator == '>:':
             retval = lambda n, m=None, l=None: (
-                hasattr(n, "parent")
+                hasattr(n, 'parent')
                 and bool(n.parent())
                 and len(n.parent()) == 1
                 and predicate(n.parent(), m, l)
             )
         # A << B      A dominates B (A is an ancestor of B).
-        elif operator == "<<":
+        elif operator == '<<':
             retval = lambda n, m=None, l=None: (
                 _istree(n) and any(predicate(x, m, l) for x in _descendants(n))
             )
         # A >> B      A is dominated by B (A is a descendant of B).
-        elif operator == ">>":
+        elif operator == '>>':
             retval = lambda n, m=None, l=None: any(
                 predicate(x, m, l) for x in ancestors(n)
             )
         # A <<, B     B is a left-most descendant of A.
-        elif operator == "<<," or operator == "<<1":
+        elif operator == '<<,' or operator == '<<1':
             retval = lambda n, m=None, l=None: (
                 _istree(n) and any(predicate(x, m, l) for x in _leftmost_descendants(n))
             )
         # A >>, B     A is a left-most descendant of B.
-        elif operator == ">>,":
+        elif operator == '>>,':
             retval = lambda n, m=None, l=None: any(
                 (predicate(x, m, l) and n in _leftmost_descendants(x))
                 for x in ancestors(n)
             )
         # A <<' B     B is a right-most descendant of A.
-        elif operator == "<<'":
+        elif operator == '<<\'':
             retval = lambda n, m=None, l=None: (
                 _istree(n)
                 and any(predicate(x, m, l) for x in _rightmost_descendants(n))
             )
         # A >>' B     A is a right-most descendant of B.
-        elif operator == ">>'":
+        elif operator == '>>\'':
             retval = lambda n, m=None, l=None: any(
                 (predicate(x, m, l) and n in _rightmost_descendants(x))
                 for x in ancestors(n)
             )
         # A <<: B     There is a single path of descent from A and B is on it.
-        elif operator == "<<:":
+        elif operator == '<<:':
             retval = lambda n, m=None, l=None: (
                 _istree(n) and any(predicate(x, m, l) for x in _unique_descendants(n))
             )
         # A >>: B     There is a single path of descent from B and A is on it.
-        elif operator == ">>:":
+        elif operator == '>>:':
             retval = lambda n, m=None, l=None: any(
                 predicate(x, m, l) for x in unique_ancestors(n)
             )
         # A . B       A immediately precedes B.
-        elif operator == ".":
+        elif operator == '.':
             retval = lambda n, m=None, l=None: any(
                 predicate(x, m, l) for x in _immediately_after(n)
             )
         # A , B       A immediately follows B.
-        elif operator == ",":
+        elif operator == ',':
             retval = lambda n, m=None, l=None: any(
                 predicate(x, m, l) for x in _immediately_before(n)
             )
         # A .. B      A precedes B.
-        elif operator == "..":
+        elif operator == '..':
             retval = lambda n, m=None, l=None: any(
                 predicate(x, m, l) for x in _after(n)
             )
         # A ,, B      A follows B.
-        elif operator == ",,":
+        elif operator == ',,':
             retval = lambda n, m=None, l=None: any(
                 predicate(x, m, l) for x in _before(n)
             )
         # A $ B       A is a sister of B (and A != B).
-        elif operator == "$" or operator == "%":
+        elif operator == '$' or operator == '%':
             retval = lambda n, m=None, l=None: (
-                hasattr(n, "parent")
+                hasattr(n, 'parent')
                 and bool(n.parent())
                 and any(predicate(x, m, l) for x in n.parent() if x is not n)
             )
         # A $. B      A is a sister of and immediately precedes B.
-        elif operator == "$." or operator == "%.":
+        elif operator == '$.' or operator == '%.':
             retval = lambda n, m=None, l=None: (
-                hasattr(n, "right_sibling")
+                hasattr(n, 'right_sibling')
                 and bool(n.right_sibling())
                 and predicate(n.right_sibling(), m, l)
             )
         # A $, B      A is a sister of and immediately follows B.
-        elif operator == "$," or operator == "%,":
+        elif operator == '$,' or operator == '%,':
             retval = lambda n, m=None, l=None: (
-                hasattr(n, "left_sibling")
+                hasattr(n, 'left_sibling')
                 and bool(n.left_sibling())
                 and predicate(n.left_sibling(), m, l)
             )
         # A $.. B     A is a sister of and precedes B.
-        elif operator == "$.." or operator == "%..":
+        elif operator == '$..' or operator == '%..':
             retval = lambda n, m=None, l=None: (
-                hasattr(n, "parent")
-                and hasattr(n, "parent_index")
+                hasattr(n, 'parent')
+                and hasattr(n, 'parent_index')
                 and bool(n.parent())
                 and any(predicate(x, m, l) for x in n.parent()[n.parent_index() + 1 :])
             )
         # A $,, B     A is a sister of and follows B.
-        elif operator == "$,," or operator == "%,,":
+        elif operator == '$,,' or operator == '%,,':
             retval = lambda n, m=None, l=None: (
-                hasattr(n, "parent")
-                and hasattr(n, "parent_index")
+                hasattr(n, 'parent')
+                and hasattr(n, 'parent_index')
                 and bool(n.parent())
                 and any(predicate(x, m, l) for x in n.parent()[: n.parent_index()])
             )
@@ -629,8 +636,8 @@ def _tgrep_relation_action(_s, _l, tokens):
         return retval
 
 
-def _tgrep_conjunction_action(_s, _l, tokens, join_char="&"):
-    """
+def _tgrep_conjunction_action(_s, _l, tokens, join_char='&'):
+    '''
     Builds a lambda function representing a predicate on a tree node
     from the conjunction of several other such lambda functions.
 
@@ -651,9 +658,10 @@ def _tgrep_conjunction_action(_s, _l, tokens, join_char="&"):
     tokens[0] is a tgrep_expr predicate; tokens[1:] are an (optional)
     list of segmented patterns (`tgrep_expr_labeled`, processed by
     `_tgrep_segmented_pattern_action`).
-    """
+    '''
     # filter out the ampersand
     tokens = [x for x in tokens if x != join_char]
+    # print 'relation conjunction tokens: ', tokens
     if len(tokens) == 1:
         return tokens[0]
     else:
@@ -665,7 +673,7 @@ def _tgrep_conjunction_action(_s, _l, tokens, join_char="&"):
 
 
 def _tgrep_segmented_pattern_action(_s, _l, tokens):
-    """
+    '''
     Builds a lambda function representing a segmented pattern.
 
     Called for expressions like (`tgrep_expr_labeled`)::
@@ -687,7 +695,7 @@ def _tgrep_segmented_pattern_action(_s, _l, tokens):
     parse action to the pred use inside a node_expr.  See
     `_tgrep_node_label_use_action` and
     `_tgrep_node_label_pred_use_action`.
-    """
+    '''
     # tokens[0] is a string containing the node label
     node_label = tokens[0]
     # tokens[1:] is an (optional) list of predicates which must all
@@ -695,11 +703,11 @@ def _tgrep_segmented_pattern_action(_s, _l, tokens):
     reln_preds = tokens[1:]
 
     def pattern_segment_pred(n, m=None, l=None):
-        """This predicate function ignores its node argument."""
+        '''This predicate function ignores its node argument.'''
         # look up the bound node using its label
         if l is None or node_label not in l:
             raise TgrepException(
-                "node_label ={0} not bound in pattern".format(node_label)
+                'node_label ={0} not bound in pattern'.format(node_label)
             )
         node = l[node_label]
         # match the relation predicates against the node
@@ -709,7 +717,7 @@ def _tgrep_segmented_pattern_action(_s, _l, tokens):
 
 
 def _tgrep_node_label_use_action(_s, _l, tokens):
-    """
+    '''
     Returns the node label used to begin a tgrep_expr_labeled.  See
     `_tgrep_segmented_pattern_action`.
 
@@ -721,14 +729,14 @@ def _tgrep_node_label_use_action(_s, _l, tokens):
     expression (see `_tgrep_segmented_pattern_action`).
 
     It returns the node label.
-    """
+    '''
     assert len(tokens) == 1
-    assert tokens[0].startswith("=")
+    assert tokens[0].startswith('=')
     return tokens[0][1:]
 
 
 def _tgrep_node_label_pred_use_action(_s, _l, tokens):
-    """
+    '''
     Builds a lambda function representing a predicate on a tree node
     which describes the use of a previously bound node label.
 
@@ -740,16 +748,16 @@ def _tgrep_node_label_pred_use_action(_s, _l, tokens):
     relation).  The predicate returns true if and only if its node
     argument is identical to the node looked up in the node label
     dictionary using the node's label.
-    """
+    '''
     assert len(tokens) == 1
-    assert tokens[0].startswith("=")
+    assert tokens[0].startswith('=')
     node_label = tokens[0][1:]
 
     def node_label_use_pred(n, m=None, l=None):
         # look up the bound node using its label
         if l is None or node_label not in l:
             raise TgrepException(
-                "node_label ={0} not bound in pattern".format(node_label)
+                'node_label ={0} not bound in pattern'.format(node_label)
             )
         node = l[node_label]
         # truth means the given node is this node
@@ -759,7 +767,7 @@ def _tgrep_node_label_pred_use_action(_s, _l, tokens):
 
 
 def _tgrep_bind_node_label_action(_s, _l, tokens):
-    """
+    '''
     Builds a lambda function representing a predicate on a tree node
     which can optionally bind a matching node into the tgrep2 string's
     label_dict.
@@ -768,7 +776,7 @@ def _tgrep_bind_node_label_action(_s, _l, tokens):
 
         /NP/
         @NP=n
-    """
+    '''
     # tokens[0] is a tgrep_node_expr
     if len(tokens) == 1:
         return tokens[0]
@@ -776,7 +784,7 @@ def _tgrep_bind_node_label_action(_s, _l, tokens):
         # if present, tokens[1] is the character '=', and tokens[2] is
         # a tgrep_node_label, a string value containing the node label
         assert len(tokens) == 3
-        assert tokens[1] == "="
+        assert tokens[1] == '='
         node_pred = tokens[0]
         node_label = tokens[2]
 
@@ -785,7 +793,7 @@ def _tgrep_bind_node_label_action(_s, _l, tokens):
                 # bind `n` into the dictionary `l`
                 if l is None:
                     raise TgrepException(
-                        "cannot bind node_label {0}: label_dict is None".format(
+                        'cannot bind node_label {0}: label_dict is None'.format(
                             node_label
                         )
                     )
@@ -798,12 +806,13 @@ def _tgrep_bind_node_label_action(_s, _l, tokens):
 
 
 def _tgrep_rel_disjunction_action(_s, _l, tokens):
-    """
+    '''
     Builds a lambda function representing a predicate on a tree node
     from the disjunction of several other such lambda functions.
-    """
+    '''
     # filter out the pipe
-    tokens = [x for x in tokens if x != "|"]
+    tokens = [x for x in tokens if x != '|']
+    # print 'relation disjunction tokens: ', tokens
     if len(tokens) == 1:
         return tokens[0]
     elif len(tokens) == 2:
@@ -813,16 +822,16 @@ def _tgrep_rel_disjunction_action(_s, _l, tokens):
 
 
 def _macro_defn_action(_s, _l, tokens):
-    """
+    '''
     Builds a dictionary structure which defines the given macro.
-    """
+    '''
     assert len(tokens) == 3
-    assert tokens[0] == "@"
+    assert tokens[0] == '@'
     return {tokens[1]: tokens[2]}
 
 
 def _tgrep_exprs_action(_s, _l, tokens):
-    """
+    '''
     This is the top-level node in a tgrep2 search string; the
     predicate function it returns binds together all the state of a
     tgrep2 search string.
@@ -831,11 +840,11 @@ def _tgrep_exprs_action(_s, _l, tokens):
     from the disjunction of several tgrep expressions.  Also handles
     macro definitions and macro name binding, and node label
     definitions and node label binding.
-    """
+    '''
     if len(tokens) == 1:
         return lambda n, m=None, l=None: tokens[0](n, None, {})
     # filter out all the semicolons
-    tokens = [x for x in tokens if x != ";"]
+    tokens = [x for x in tokens if x != ';']
     # collect all macro definitions
     macro_dict = {}
     macro_defs = [tok for tok in tokens if isinstance(tok, dict)]
@@ -853,42 +862,42 @@ def _tgrep_exprs_action(_s, _l, tokens):
 
 
 def _build_tgrep_parser(set_parse_actions=True):
-    """
+    '''
     Builds a pyparsing-based parser object for tokenizing and
     interpreting tgrep search strings.
-    """
-    tgrep_op = pyparsing.Optional("!") + pyparsing.Regex("[$%,.<>][%,.<>0-9-':]*")
+    '''
+    tgrep_op = pyparsing.Optional('!') + pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*')
     tgrep_qstring = pyparsing.QuotedString(
-        quoteChar='"', escChar="\\", unquoteResults=False
+        quoteChar='"', escChar='\\', unquoteResults=False
     )
     tgrep_node_regex = pyparsing.QuotedString(
-        quoteChar="/", escChar="\\", unquoteResults=False
+        quoteChar='/', escChar='\\', unquoteResults=False
     )
     tgrep_qstring_icase = pyparsing.Regex('i@\\"(?:[^"\\n\\r\\\\]|(?:\\\\.))*\\"')
-    tgrep_node_regex_icase = pyparsing.Regex("i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/")
-    tgrep_node_literal = pyparsing.Regex("[^][ \r\t\n;:.,&|<>()$!@%'^=]+")
+    tgrep_node_regex_icase = pyparsing.Regex('i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/')
+    tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%\'^=]+')
     tgrep_expr = pyparsing.Forward()
     tgrep_relations = pyparsing.Forward()
-    tgrep_parens = pyparsing.Literal("(") + tgrep_expr + ")"
+    tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')'
     tgrep_nltk_tree_pos = (
-        pyparsing.Literal("N(")
+        pyparsing.Literal('N(')
         + pyparsing.Optional(
             pyparsing.Word(pyparsing.nums)
-            + ","
+            + ','
             + pyparsing.Optional(
-                pyparsing.delimitedList(pyparsing.Word(pyparsing.nums), delim=",")
-                + pyparsing.Optional(",")
+                pyparsing.delimitedList(pyparsing.Word(pyparsing.nums), delim=',')
+                + pyparsing.Optional(',')
             )
         )
-        + ")"
+        + ')'
     )
-    tgrep_node_label = pyparsing.Regex("[A-Za-z0-9]+")
-    tgrep_node_label_use = pyparsing.Combine("=" + tgrep_node_label)
+    tgrep_node_label = pyparsing.Regex('[A-Za-z0-9]+')
+    tgrep_node_label_use = pyparsing.Combine('=' + tgrep_node_label)
     # see _tgrep_segmented_pattern_action
     tgrep_node_label_use_pred = tgrep_node_label_use.copy()
-    macro_name = pyparsing.Regex("[^];:.,&|<>()[$!@%'^=\r\t\n ]+")
-    macro_name.setWhitespaceChars("")
-    macro_use = pyparsing.Combine("@" + macro_name)
+    macro_name = pyparsing.Regex('[^];:.,&|<>()[$!@%\'^=\r\t\n ]+')
+    macro_name.setWhitespaceChars('')
+    macro_use = pyparsing.Combine('@' + macro_name)
     tgrep_node_expr = (
         tgrep_node_label_use_pred
         | macro_use
@@ -897,40 +906,40 @@ def _build_tgrep_parser(set_parse_actions=True):
         | tgrep_node_regex_icase
         | tgrep_qstring
         | tgrep_node_regex
-        | "*"
+        | '*'
         | tgrep_node_literal
     )
     tgrep_node_expr2 = (
         tgrep_node_expr
-        + pyparsing.Literal("=").setWhitespaceChars("")
-        + tgrep_node_label.copy().setWhitespaceChars("")
+        + pyparsing.Literal('=').setWhitespaceChars('')
+        + tgrep_node_label.copy().setWhitespaceChars('')
     ) | tgrep_node_expr
     tgrep_node = tgrep_parens | (
         pyparsing.Optional("'")
         + tgrep_node_expr2
         + pyparsing.ZeroOrMore("|" + tgrep_node_expr)
     )
-    tgrep_brackets = pyparsing.Optional("!") + "[" + tgrep_relations + "]"
+    tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']'
     tgrep_relation = tgrep_brackets | (tgrep_op + tgrep_node)
     tgrep_rel_conjunction = pyparsing.Forward()
     tgrep_rel_conjunction << (
         tgrep_relation
-        + pyparsing.ZeroOrMore(pyparsing.Optional("&") + tgrep_rel_conjunction)
+        + pyparsing.ZeroOrMore(pyparsing.Optional('&') + tgrep_rel_conjunction)
     )
     tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
         "|" + tgrep_relations
     )
     tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
     tgrep_expr_labeled = tgrep_node_label_use + pyparsing.Optional(tgrep_relations)
-    tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(":" + tgrep_expr_labeled)
+    tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(':' + tgrep_expr_labeled)
     macro_defn = (
-        pyparsing.Literal("@") + pyparsing.White().suppress() + macro_name + tgrep_expr2
+        pyparsing.Literal('@') + pyparsing.White().suppress() + macro_name + tgrep_expr2
     )
     tgrep_exprs = (
-        pyparsing.Optional(macro_defn + pyparsing.ZeroOrMore(";" + macro_defn) + ";")
+        pyparsing.Optional(macro_defn + pyparsing.ZeroOrMore(';' + macro_defn) + ';')
         + tgrep_expr2
-        + pyparsing.ZeroOrMore(";" + (macro_defn | tgrep_expr2))
-        + pyparsing.ZeroOrMore(";").suppress()
+        + pyparsing.ZeroOrMore(';' + (macro_defn | tgrep_expr2))
+        + pyparsing.ZeroOrMore(';').suppress()
     )
     if set_parse_actions:
         tgrep_node_label_use.setParseAction(_tgrep_node_label_use_action)
@@ -950,38 +959,38 @@ def _build_tgrep_parser(set_parse_actions=True):
         tgrep_expr.setParseAction(_tgrep_conjunction_action)
         tgrep_expr_labeled.setParseAction(_tgrep_segmented_pattern_action)
         tgrep_expr2.setParseAction(
-            functools.partial(_tgrep_conjunction_action, join_char=":")
+            functools.partial(_tgrep_conjunction_action, join_char=':')
         )
         tgrep_exprs.setParseAction(_tgrep_exprs_action)
-    return tgrep_exprs.ignore("#" + pyparsing.restOfLine)
+    return tgrep_exprs.ignore('#' + pyparsing.restOfLine)
 
 
 def tgrep_tokenize(tgrep_string):
-    """
+    '''
     Tokenizes a TGrep search string into separate tokens.
-    """
+    '''
     parser = _build_tgrep_parser(False)
-    if isinstance(tgrep_string, bytes):
+    if isinstance(tgrep_string, binary_type):
         tgrep_string = tgrep_string.decode()
     return list(parser.parseString(tgrep_string))
 
 
 def tgrep_compile(tgrep_string):
-    """
+    '''
     Parses (and tokenizes, if necessary) a TGrep search string into a
     lambda function.
-    """
+    '''
     parser = _build_tgrep_parser(True)
-    if isinstance(tgrep_string, bytes):
+    if isinstance(tgrep_string, binary_type):
         tgrep_string = tgrep_string.decode()
     return list(parser.parseString(tgrep_string, parseAll=True))[0]
 
 
 def treepositions_no_leaves(tree):
-    """
+    '''
     Returns all the tree positions in the given tree which are not
     leaf nodes.
-    """
+    '''
     treepositions = tree.treepositions()
     # leaves are treeposition tuples that are not prefixes of any
     # other treeposition
@@ -1005,7 +1014,7 @@ def tgrep_positions(pattern, trees, search_leaves=True):
     :rtype: iter(tree positions)
     """
 
-    if isinstance(pattern, (bytes, str)):
+    if isinstance(pattern, (binary_type, text_type)):
         pattern = tgrep_compile(pattern)
 
     for tree in trees:
@@ -1032,7 +1041,7 @@ def tgrep_nodes(pattern, trees, search_leaves=True):
     :rtype: iter(tree nodes)
     """
 
-    if isinstance(pattern, (bytes, str)):
+    if isinstance(pattern, (binary_type, text_type)):
         pattern = tgrep_compile(pattern)
 
     for tree in trees:
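
As a rough usage sketch for the tgrep module shown above (not part of the patch; it assumes an NLTK install with pyparsing available, and the sample tree is invented): patterns compile to predicates over ParentedTree nodes, and the module-level helpers walk whole trees.

import nltk
from nltk.tgrep import tgrep_compile, tgrep_positions, tgrep_nodes

tree = nltk.tree.ParentedTree.fromstring(
    '(S (NP (DT the) (NN dog)) (VP (VBD barked)))')

pred = tgrep_compile('NN > NP')            # an NN node whose parent is an NP
print(pred(tree[0][1]))                    # the (NN dog) node -> True

# The helpers yield one result list per input tree.
print(list(tgrep_positions('NP << dog', [tree])))                   # [[(0,)]]
print([n.label() for n in next(tgrep_nodes('__ < VBD', [tree]))])   # ['VP']
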
index 241b9f3..7068cba 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Tokenizers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com> (minor additions)
 # Contributors: matthewmc, clouds56
@@ -65,7 +65,6 @@ import re
 from nltk.data import load
 from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
 from nltk.tokenize.mwe import MWETokenizer
-from nltk.tokenize.destructive import NLTKWordTokenizer
 from nltk.tokenize.punkt import PunktSentenceTokenizer
 from nltk.tokenize.regexp import (
     RegexpTokenizer,
@@ -89,11 +88,10 @@ from nltk.tokenize.toktok import ToktokTokenizer
 from nltk.tokenize.treebank import TreebankWordTokenizer
 from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
 from nltk.tokenize.stanford_segmenter import StanfordSegmenter
-from nltk.tokenize.sonority_sequencing import SyllableTokenizer
 
 
 # Standard sentence tokenizer.
-def sent_tokenize(text, language="english"):
+def sent_tokenize(text, language='english'):
     """
     Return a sentence-tokenized copy of *text*,
     using NLTK's recommended sentence tokenizer
@@ -103,15 +101,31 @@ def sent_tokenize(text, language="english"):
     :param text: text to split into sentences
     :param language: the model name in the Punkt corpus
     """
-    tokenizer = load("tokenizers/punkt/{0}.pickle".format(language))
+    tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
     return tokenizer.tokenize(text)
 
 
 # Standard word tokenizer.
-_treebank_word_tokenizer = NLTKWordTokenizer()
-
-
-def word_tokenize(text, language="english", preserve_line=False):
+_treebank_word_tokenizer = TreebankWordTokenizer()
+
+# See discussion on https://github.com/nltk/nltk/pull/1437
+# Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
+# - chervon quotes u'\xab' and u'\xbb' .
+# - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
+# See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
+# Also, behavior of splitting on clitics now follows Stanford CoreNLP
+# - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b
+improved_open_quote_regex = re.compile(u'([«“‘„]|[`]+)', re.U)
+improved_open_single_quote_regex = re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d)(\w)\b", re.U)
+improved_close_quote_regex = re.compile(u'([»”’])', re.U)
+improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
+_treebank_word_tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
+_treebank_word_tokenizer.STARTING_QUOTES.append((improved_open_single_quote_regex, r'\1 \2'))
+_treebank_word_tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
+_treebank_word_tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))
+
+
+def word_tokenize(text, language='english', preserve_line=False):
     """
     Return a tokenized copy of *text*,
     using NLTK's recommended word tokenizer
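
A short sketch of the reverted entry points in use, assuming the Punkt model has been fetched (for example via nltk.download('punkt')); the sample text is illustrative:

from nltk.tokenize import sent_tokenize, word_tokenize

text = "Good muffins cost $3.88 in New York.  Please buy me two of them."
print(sent_tokenize(text))
# ['Good muffins cost $3.88 in New York.', 'Please buy me two of them.']
print(word_tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']
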
index 3d7a120..49a23d1 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/__init__.cpython-37.pyc differ
index 6adc126..31e1a7b 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/api.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/api.cpython-37.pyc differ
index 5598df2..410cf6e 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/casual.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/casual.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/__pycache__/destructive.cpython-37.pyc b/nlp_resource_data/nltk/tokenize/__pycache__/destructive.cpython-37.pyc
deleted file mode 100644 (file)
index bfdd50a..0000000
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/destructive.cpython-37.pyc and /dev/null differ
index 00bd19a..bc99758 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/mwe.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/mwe.cpython-37.pyc differ
index 673b0cd..9eba70b 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/nist.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/nist.cpython-37.pyc differ
index a488cf5..2405970 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/punkt.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/punkt.cpython-37.pyc differ
index 6f449ab..5b29816 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/regexp.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/regexp.cpython-37.pyc differ
index a4277c4..b4ddbf5 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/repp.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/repp.cpython-37.pyc differ
index 59ecdf1..6769cee 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/sexpr.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/sexpr.cpython-37.pyc differ
index fd0882f..58ee439 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/simple.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/simple.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/tokenize/__pycache__/sonority_sequencing.cpython-37.pyc b/nlp_resource_data/nltk/tokenize/__pycache__/sonority_sequencing.cpython-37.pyc
deleted file mode 100644 (file)
index aa2a6dd..0000000
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/sonority_sequencing.cpython-37.pyc and /dev/null differ
index eaa06cf..99fb947 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/stanford.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/stanford.cpython-37.pyc differ
index ba109a5..189a51b 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/stanford_segmenter.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/stanford_segmenter.cpython-37.pyc differ
index 1c15047..be34b44 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/texttiling.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/texttiling.cpython-37.pyc differ
index d0901ee..c5cd322 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/toktok.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/toktok.cpython-37.pyc differ
index 04e5241..16b629f 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/treebank.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/treebank.cpython-37.pyc differ
index 119bd0e..5ad36ea 100644 (file)
Binary files a/nlp_resource_data/nltk/tokenize/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/tokenize/__pycache__/util.cpython-37.pyc differ
index 316e385..476db21 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Tokenizer Interface
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 Tokenizer Interface
 """
 
-from abc import ABC, abstractmethod
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
 
 from nltk.internals import overridden
 from nltk.tokenize.util import string_span_tokenize
 
 
-class TokenizerI(ABC):
+@add_metaclass(ABCMeta)
+class TokenizerI(object):
     """
     A processing interface for tokenizing a string.
     Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
@@ -68,11 +70,6 @@ class StringTokenizer(TokenizerI):
     on the specified string (defined in subclasses).
     """
 
-    @property
-    @abstractmethod
-    def _string(self):
-        raise NotImplementedError
-
     def tokenize(self, s):
         return s.split(self._string)
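
For illustration only (not in the patch): the six-based TokenizerI above is consumed by subclassing and defining tokenize(); the class below is a hypothetical example, and tokenize_sents() comes for free from the interface.

from nltk.tokenize.api import TokenizerI

class CommaTokenizer(TokenizerI):          # hypothetical example subclass
    def tokenize(self, s):
        return [t.strip() for t in s.split(',')]

print(CommaTokenizer().tokenize("alpha, beta, gamma"))   # ['alpha', 'beta', 'gamma']
print(CommaTokenizer().tokenize_sents(["a,b", "c"]))     # [['a', 'b'], ['c']]
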
 
index 9187cd1..edc82f2 100644 (file)
@@ -2,7 +2,7 @@
 #
 # Natural Language Toolkit: Twitter Tokenizer
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Christopher Potts <cgpotts@stanford.edu>
 #         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
 #         Pierpaolo Pantone <> (modifications)
@@ -35,8 +35,11 @@ domains and tasks. The basic logic is this:
 
 ######################################################################
 
-import regex  # https://github.com/nltk/nltk/issues/2409
-import html
+from __future__ import unicode_literals
+import re
+
+from six import int2byte, unichr
+from six.moves import html_entities
 
 ######################################################################
 # The following strings are components in the regular expression
@@ -163,17 +166,17 @@ REGEXPS = (
 ######################################################################
 # This is the core tokenizing regex:
 
-WORD_RE = regex.compile(r"""(%s)""" % "|".join(REGEXPS), regex.VERBOSE | regex.I | regex.UNICODE)
+WORD_RE = re.compile(r"""(%s)""" % "|".join(REGEXPS), re.VERBOSE | re.I | re.UNICODE)
 
 # WORD_RE performs poorly on these patterns:
-HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")
+HANG_RE = re.compile(r'([^a-zA-Z0-9])\1{3,}')
 
 # The emoticon string gets its own regex so that we can preserve case for
 # them as needed:
-EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)
+EMOTICON_RE = re.compile(EMOTICONS, re.VERBOSE | re.I | re.UNICODE)
 
 # These are for regularizing HTML entities to Unicode:
-ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")
+ENT_RE = re.compile(r'&(#?(x?))([^&;\s]+);')
 
 
 ######################################################################
@@ -181,15 +184,15 @@ ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")
 ######################################################################
 
 
-def _str_to_unicode(text, encoding=None, errors="strict"):
+def _str_to_unicode(text, encoding=None, errors='strict'):
     if encoding is None:
-        encoding = "utf-8"
+        encoding = 'utf-8'
     if isinstance(text, bytes):
         return text.decode(encoding, errors)
     return text
 
 
-def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
+def _replace_html_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
     """
     Remove entities from text by converting them to their
     corresponding unicode character.
@@ -228,19 +231,19 @@ def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8")
                 # Numeric character references in the 80-9F range are typically
                 # interpreted by browsers as representing the characters mapped
                 # to bytes 80-9F in the Windows-1252 encoding. For more info
-                # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
+                # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
                 if 0x80 <= number <= 0x9F:
-                    return bytes((number,)).decode("cp1252")
+                    return int2byte(number).decode('cp1252')
             except ValueError:
                 number = None
         else:
             if entity_body in keep:
                 return match.group(0)
             else:
-                number = html.entities.name2codepoint.get(entity_body)
+                number = html_entities.name2codepoint.get(entity_body)
         if number is not None:
             try:
-                return chr(number)
+                return unichr(number)
             except ValueError:
                 pass
 
@@ -291,7 +294,7 @@ class TweetTokenizer:
         if self.reduce_len:
             text = reduce_lengthening(text)
         # Shorten problematic sequences of characters
-        safe_text = HANG_RE.sub(r"\1\1\1", text)
+        safe_text = HANG_RE.sub(r'\1\1\1', text)
         # Tokenize:
         words = WORD_RE.findall(safe_text)
         # Possibly alter the case, but avoid changing emoticons like :D into :d:
@@ -312,7 +315,7 @@ def reduce_lengthening(text):
     Replace repeated character sequences of length 3 or greater with sequences
     of length 3.
     """
-    pattern = regex.compile(r"(.)\1{2,}")
+    pattern = re.compile(r"(.)\1{2,}")
     return pattern.sub(r"\1\1\1", text)
 
 
@@ -320,11 +323,11 @@ def remove_handles(text):
     """
     Remove Twitter username handles from text.
     """
-    pattern = regex.compile(
+    pattern = re.compile(
         r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
     )
-    # Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly
-    return pattern.sub(" ", text)
+    # Substitute handles with ' ' to ensure that text on either side of removed handles is tokenized correctly
+    return pattern.sub(' ', text)
 
 
 ######################################################################
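
A brief usage sketch of the casual tokenizer after this revert (mirroring the module's own doctest; strip_handles and reduce_len are real constructor flags):

from nltk.tokenize.casual import TweetTokenizer

tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
print(tknzr.tokenize("@remy: This is waaaaayyyy too much for you!!!!!!"))
# [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
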
diff --git a/nlp_resource_data/nltk/tokenize/destructive.py b/nlp_resource_data/nltk/tokenize/destructive.py
deleted file mode 100644 (file)
index 5cb524f..0000000
+++ /dev/null
@@ -1,141 +0,0 @@
-# Natural Language Toolkit: NLTK's very own tokenizer.
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author:
-# URL: <http://nltk.sourceforge.net>
-# For license information, see LICENSE.TXT
-
-
-import re
-from nltk.tokenize.api import TokenizerI
-
-
-class MacIntyreContractions:
-    """
-    List of contractions adapted from Robert MacIntyre's tokenizer.
-    """
-
-    CONTRACTIONS2 = [
-        r"(?i)\b(can)(?#X)(not)\b",
-        r"(?i)\b(d)(?#X)('ye)\b",
-        r"(?i)\b(gim)(?#X)(me)\b",
-        r"(?i)\b(gon)(?#X)(na)\b",
-        r"(?i)\b(got)(?#X)(ta)\b",
-        r"(?i)\b(lem)(?#X)(me)\b",
-        r"(?i)\b(mor)(?#X)('n)\b",
-        r"(?i)\b(wan)(?#X)(na)\s",
-    ]
-    CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
-    CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
-
-
-class NLTKWordTokenizer(TokenizerI):
-    """
-    The NLTK tokenizer that has improved upon the TreebankWordTokenizer.
-
-    The tokenizer is "destructive" such that the regexes applied will munge the
-    input string to a state beyond re-construction. It is possible to apply
-    `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
-    `NLTKDestructiveWordTokenizer.tokenize` but there is no guarantee of
-    reverting to the original string.
-    """
-
-    # Starting quotes.
-    STARTING_QUOTES = [
-        (re.compile(u"([«“‘„]|[`]+)", re.U), r" \1 "),
-        (re.compile(r"^\""), r"``"),
-        (re.compile(r"(``)"), r" \1 "),
-        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
-        (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d)(\w)\b", re.U), r"\1 \2"),
-    ]
-
-    # Ending quotes.
-    ENDING_QUOTES = [
-        (re.compile(u"([»”’])", re.U), r" \1 "),
-        (re.compile(r'"'), " '' "),
-        (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
-        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
-        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
-    ]
-
-    # For improvements for starting/closing quotes from TreebankWordTokenizer,
-    # see discussion on https://github.com/nltk/nltk/pull/1437
-    # Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
-    # - chevron quotes u'\xab' and u'\xbb' .
-    # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
-    # See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
-    # Also, behavior of splitting on clitics now follows Stanford CoreNLP
-    # - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b
-
-    # Punctuation.
-    PUNCTUATION = [
-        (re.compile(r'([^\.])(\.)([\]\)}>"\'' u"»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "),
-        (re.compile(r"([:,])([^\d])"), r" \1 \2"),
-        (re.compile(r"([:,])$"), r" \1 "),
-        (re.compile(r"\.{2,}", re.U), r" \g<0> "), # See https://github.com/nltk/nltk/pull/2322
-        (re.compile(r"[;@#$%&]"), r" \g<0> "),
-        (
-            re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
-            r"\1 \2\3 ",
-        ),  # Handles the final period.
-        (re.compile(r"[?!]"), r" \g<0> "),
-        (re.compile(r"([^'])' "), r"\1 ' "),
-        (re.compile(r"[*]", re.U), r" \g<0> "), # See https://github.com/nltk/nltk/pull/2322
-    ]
-
-    # Pads parentheses
-    PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
-
-    # Optionally: Convert parentheses, brackets and converts them to PTB symbols.
-    CONVERT_PARENTHESES = [
-        (re.compile(r"\("), "-LRB-"),
-        (re.compile(r"\)"), "-RRB-"),
-        (re.compile(r"\["), "-LSB-"),
-        (re.compile(r"\]"), "-RSB-"),
-        (re.compile(r"\{"), "-LCB-"),
-        (re.compile(r"\}"), "-RCB-"),
-    ]
-
-    DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
-
-    # List of contractions adapted from Robert MacIntyre's tokenizer.
-    _contractions = MacIntyreContractions()
-    CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
-    CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
-
-    def tokenize(self, text, convert_parentheses=False, return_str=False):
-        for regexp, substitution in self.STARTING_QUOTES:
-            text = regexp.sub(substitution, text)
-
-        for regexp, substitution in self.PUNCTUATION:
-            text = regexp.sub(substitution, text)
-
-        # Handles parentheses.
-        regexp, substitution = self.PARENS_BRACKETS
-        text = regexp.sub(substitution, text)
-        # Optionally convert parentheses
-        if convert_parentheses:
-            for regexp, substitution in self.CONVERT_PARENTHESES:
-                text = regexp.sub(substitution, text)
-
-        # Handles double dash.
-        regexp, substitution = self.DOUBLE_DASHES
-        text = regexp.sub(substitution, text)
-
-        # add extra space to make things easier
-        text = " " + text + " "
-
-        for regexp, substitution in self.ENDING_QUOTES:
-            text = regexp.sub(substitution, text)
-
-        for regexp in self.CONTRACTIONS2:
-            text = regexp.sub(r" \1 \2 ", text)
-        for regexp in self.CONTRACTIONS3:
-            text = regexp.sub(r" \1 \2 ", text)
-
-        # We are not using CONTRACTIONS4 since
-        # they are also commented out in the SED scripts
-        # for regexp in self._contractions.CONTRACTIONS4:
-        #     text = regexp.sub(r' \1 \2 \3 ', text)
-
-        return text if return_str else text.split()
index 9e4b991..5c61363 100644 (file)
@@ -1,6 +1,6 @@
 # Multi-Word Expression tokenizer
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Rob Malouf <rmalouf@mail.sdsu.edu>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -38,7 +38,7 @@ class MWETokenizer(TokenizerI):
     into single tokens.
     """
 
-    def __init__(self, mwes=None, separator="_"):
+    def __init__(self, mwes=None, separator='_'):
         """Initialize the multi-word tokenizer with a list of expressions and a
         separator
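
As an illustration of the merge behaviour (following the module's doctest; not part of this change):

from nltk.tokenize import MWETokenizer

tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
tokenizer.add_mwe(('in', 'spite', 'of'))
print(tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split()))
# ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']
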
 
index e6b7491..28d7e08 100644 (file)
@@ -15,9 +15,11 @@ which was also ported into Python in
 https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
 """
 
+from __future__ import unicode_literals
 
 import io
 import re
+from six import text_type
 
 from nltk.corpus import perluniprops
 from nltk.tokenize.api import TokenizerI
@@ -30,6 +32,7 @@ class NISTTokenizer(TokenizerI):
     paragraph-based tokenization from mteval-14.pl; The sentence-based
     tokenization is consistent with the other tokenizers available in NLTK.
 
+    >>> from six import text_type
     >>> from nltk.tokenize.nist import NISTTokenizer
     >>> nist = NISTTokenizer()
     >>> s = "Good muffins cost $3.88 in New York."
@@ -71,17 +74,17 @@ class NISTTokenizer(TokenizerI):
     """
 
     # Strip "skipped" tags
-    STRIP_SKIP = re.compile("<skipped>"), ""
+    STRIP_SKIP = re.compile('<skipped>'), ''
     #  Strip end-of-line hyphenation and join lines
-    STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
+    STRIP_EOL_HYPHEN = re.compile(u'\u2028'), ' '
     # Tokenize punctuation.
-    PUNCT = re.compile("([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
+    PUNCT = re.compile('([\{-\~\[-\` -\&\(-\+\:-\@\/])'), ' \\1 '
     # Tokenize period and comma unless preceded by a digit.
-    PERIOD_COMMA_PRECEED = re.compile("([^0-9])([\.,])"), "\\1 \\2 "
+    PERIOD_COMMA_PRECEED = re.compile('([^0-9])([\.,])'), '\\1 \\2 '
     # Tokenize period and comma unless followed by a digit.
-    PERIOD_COMMA_FOLLOW = re.compile("([\.,])([^0-9])"), " \\1 \\2"
+    PERIOD_COMMA_FOLLOW = re.compile('([\.,])([^0-9])'), ' \\1 \\2'
     # Tokenize dash when preceded by a digit
-    DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "
+    DASH_PRECEED_DIGIT = re.compile('([0-9])(-)'), '\\1 \\2 '
 
     LANG_DEPENDENT_REGEXES = [
         PUNCT,
@@ -91,37 +94,37 @@ class NISTTokenizer(TokenizerI):
     ]
 
     # Perluniprops characters used in NIST tokenizer.
-    pup_number = str("".join(set(perluniprops.chars("Number"))))  # i.e. \p{N}
-    pup_punct = str("".join(set(perluniprops.chars("Punctuation"))))  # i.e. \p{P}
-    pup_symbol = str("".join(set(perluniprops.chars("Symbol"))))  # i.e. \p{S}
+    pup_number = text_type(''.join(set(perluniprops.chars('Number'))))  # i.e. \p{N}
+    pup_punct = text_type(''.join(set(perluniprops.chars('Punctuation'))))  # i.e. \p{P}
+    pup_symbol = text_type(''.join(set(perluniprops.chars('Symbol'))))  # i.e. \p{S}
 
     # Python regexes needs to escape some special symbols, see
     # see https://stackoverflow.com/q/45670950/610569
-    number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
-    punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
-    symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)
+    number_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_number)
+    punct_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_punct)
+    symbol_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_symbol)
 
     # Note: In the original perl implementation, \p{Z} and \p{Zl} were used to
     #       (i) strip trailing and heading spaces  and
     #       (ii) de-duplicate spaces.
     #       In Python, this would do: ' '.join(str.strip().split())
     # Thus, the next two lines were commented out.
-    # Line_Separator = str(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
-    # Separator = str(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}
+    # Line_Separator = text_type(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
+    # Separator = text_type(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}
 
     # Pads non-ascii strings with space.
-    NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
+    NONASCII = re.compile('([\x00-\x7f]+)'), r' \1 '
     #  Tokenize any punctuation unless followed AND preceded by a digit.
     PUNCT_1 = (
-        re.compile("([{n}])([{p}])".format(n=number_regex, p=punct_regex)),
-        "\\1 \\2 ",
+        re.compile(u"([{n}])([{p}])".format(n=number_regex, p=punct_regex)),
+        '\\1 \\2 ',
     )
     PUNCT_2 = (
-        re.compile("([{p}])([{n}])".format(n=number_regex, p=punct_regex)),
-        " \\1 \\2",
+        re.compile(u"([{p}])([{n}])".format(n=number_regex, p=punct_regex)),
+        ' \\1 \\2',
     )
     # Tokenize symbols
-    SYMBOLS = re.compile("([{s}])".format(s=symbol_regex)), " \\1 "
+    SYMBOLS = re.compile(u"([{s}])".format(s=symbol_regex)), ' \\1 '
 
     INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
 
@@ -138,28 +141,28 @@ class NISTTokenizer(TokenizerI):
         return text
 
     def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
-        text = str(text)
+        text = text_type(text)
         # Language independent regex.
         text = self.lang_independent_sub(text)
         # Language dependent regex.
         if western_lang:
             # Pad string with whitespace.
-            text = " " + text + " "
+            text = ' ' + text + ' '
             if lowercase:
                 text = text.lower()
             for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
                 text = regexp.sub(substitution, text)
         # Remove contiguous whitespaces.
-        text = " ".join(text.split())
+        text = ' '.join(text.split())
         # Finally, strips leading and trailing spaces
         # and converts output string into unicode.
-        text = str(text.strip())
+        text = text_type(text.strip())
         return text if return_str else text.split()
 
     def international_tokenize(
         self, text, lowercase=False, split_non_ascii=True, return_str=False
     ):
-        text = str(text)
+        text = text_type(text)
         # Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
         # first before unescaping.
         regexp, substitution = self.STRIP_SKIP
@@ -176,5 +179,5 @@ class NISTTokenizer(TokenizerI):
 
         # Make sure that there's only one space between words.
         # Strip leading and trailing spaces.
-        text = " ".join(text.strip().split())
+        text = ' '.join(text.strip().split())
         return text if return_str else text.split()
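
As an illustration of the two building blocks this tokenizer relies on, the sketch below escapes a raw character set before embedding it in a regex class and then collapses whitespace with ' '.join(text.split()). The punctuation set and the digit-adjacent padding rules are simplified stand-ins, not the real perluniprops-derived data.

    import re

    # Hypothetical stand-in for ''.join(set(perluniprops.chars('Punctuation'))).
    punct_chars = ".,;:!?()[]^-\\"

    # Escape ']', '^', '\' and '-' so the set is safe inside a character class,
    # the same substitution used to build number_regex/punct_regex/symbol_regex.
    punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", punct_chars)

    # Pad punctuation that touches a digit, in the spirit of PUNCT_1/PUNCT_2.
    pad_after_digit = re.compile("([0-9])([{p}])".format(p=punct_regex))
    pad_before_digit = re.compile("([{p}])([0-9])".format(p=punct_regex))

    def rough_tokenize(text):
        text = pad_after_digit.sub(r"\1 \2 ", text)
        text = pad_before_digit.sub(r" \1 \2", text)
        # Collapse runs of whitespace and strip, mirroring ' '.join(text.split()).
        return " ".join(text.split()).split()

    print(rough_tokenize("Version 3.4, released 2019!"))
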
index 408ce27..76fd868 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Punkt sentence tokenizer
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Algorithm: Kiss & Strunk (2006)
 # Author: Willy <willy@csse.unimelb.edu.au> (original Python port)
 #         Steven Bird <stevenbird1@gmail.com> (additions)
@@ -99,6 +99,7 @@ The algorithm for this tokenizer is described in::
   Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
     Boundary Detection.  Computational Linguistics 32: 485-525.
 """
+from __future__ import print_function, unicode_literals, division
 
 # TODO: Make orthographic heuristic less susceptible to overtraining
 # TODO: Frequent sentence starters optionally exclude always-capitalised words
@@ -108,6 +109,9 @@ import re
 import math
 from collections import defaultdict
 
+from six import string_types
+
+from nltk.compat import unicode_repr, python_2_unicode_compatible
 from nltk.probability import FreqDist
 from nltk.tokenize.api import TokenizerI
 
@@ -143,12 +147,12 @@ _ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC
 """Orthographic context: occurs with lower case."""
 
 _ORTHO_MAP = {
-    ("initial", "upper"): _ORTHO_BEG_UC,
-    ("internal", "upper"): _ORTHO_MID_UC,
-    ("unknown", "upper"): _ORTHO_UNK_UC,
-    ("initial", "lower"): _ORTHO_BEG_LC,
-    ("internal", "lower"): _ORTHO_MID_LC,
-    ("unknown", "lower"): _ORTHO_UNK_LC,
+    ('initial', 'upper'): _ORTHO_BEG_UC,
+    ('internal', 'upper'): _ORTHO_MID_UC,
+    ('unknown', 'upper'): _ORTHO_UNK_UC,
+    ('initial', 'lower'): _ORTHO_BEG_LC,
+    ('internal', 'lower'): _ORTHO_MID_LC,
+    ('unknown', 'lower'): _ORTHO_UNK_LC,
 }
 """A map from context position and first-letter case to the
 appropriate orthographic context flag."""
@@ -160,14 +164,14 @@ appropriate orthographic context flag."""
 # { Decision reasons for debugging
 ######################################################################
 
-REASON_DEFAULT_DECISION = "default decision"
-REASON_KNOWN_COLLOCATION = "known collocation (both words)"
-REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC = "abbreviation + orthographic heuristic"
-REASON_ABBR_WITH_SENTENCE_STARTER = "abbreviation + frequent sentence starter"
-REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC = "initial + orthographic heuristic"
-REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC = "initial + orthographic heuristic"
+REASON_DEFAULT_DECISION = 'default decision'
+REASON_KNOWN_COLLOCATION = 'known collocation (both words)'
+REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC = 'abbreviation + orthographic heuristic'
+REASON_ABBR_WITH_SENTENCE_STARTER = 'abbreviation + frequent sentence starter'
+REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC = 'initial + orthographic heuristic'
+REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC = 'initial + orthographic heuristic'
 REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC = (
-    "initial + special orthographic heuristic"
+    'initial + special orthographic heuristic'
 )
 
 
@@ -189,7 +193,7 @@ class PunktLanguageVars(object):
     constructors.
     """
 
-    __slots__ = ("_re_period_context", "_re_word_tokenizer")
+    __slots__ = ('_re_period_context', '_re_word_tokenizer')
 
     def __getstate__(self):
         # All modifications to the class are performed by inheritance.
@@ -200,14 +204,14 @@ class PunktLanguageVars(object):
     def __setstate__(self, state):
         return 1
 
-    sent_end_chars = (".", "?", "!")
+    sent_end_chars = ('.', '?', '!')
     """Characters which are candidates for sentence boundaries"""
 
     @property
     def _re_sent_end_chars(self):
-        return "[%s]" % re.escape("".join(self.sent_end_chars))
+        return '[%s]' % re.escape(''.join(self.sent_end_chars))
 
-    internal_punctuation = ",:;"  # might want to extend this..
+    internal_punctuation = ',:;'  # might want to extend this..
     """sentence internal punctuation, which indicates an abbreviation if
     preceded by a period-final token."""
 
@@ -224,7 +228,7 @@ class PunktLanguageVars(object):
     _re_multi_char_punct = r"(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)"
     """Hyphen and ellipsis are multi-character punctuation"""
 
-    _word_tokenize_fmt = r"""(
+    _word_tokenize_fmt = r'''(
         %(MultiChar)s
         |
         (?=%(WordStart)s)\S+?  # Accept word characters until end is found
@@ -236,7 +240,7 @@ class PunktLanguageVars(object):
         )
         |
         \S
-    )"""
+    )'''
     """Format of a regular expression to split punctuation from words,
     excluding period."""
 
@@ -248,9 +252,9 @@ class PunktLanguageVars(object):
             self._re_word_tokenizer = re.compile(
                 self._word_tokenize_fmt
                 % {
-                    "NonWord": self._re_non_word_chars,
-                    "MultiChar": self._re_multi_char_punct,
-                    "WordStart": self._re_word_start,
+                    'NonWord': self._re_non_word_chars,
+                    'MultiChar': self._re_multi_char_punct,
+                    'WordStart': self._re_word_start,
                 },
                 re.UNICODE | re.VERBOSE,
             )
@@ -281,15 +285,15 @@ class PunktLanguageVars(object):
             self._re_period_context = re.compile(
                 self._period_context_fmt
                 % {
-                    "NonWord": self._re_non_word_chars,
-                    "SentEndChars": self._re_sent_end_chars,
+                    'NonWord': self._re_non_word_chars,
+                    'SentEndChars': self._re_sent_end_chars,
                 },
                 re.UNICODE | re.VERBOSE,
             )
             return self._re_period_context
 
 
-_re_non_punct = re.compile(r"[^\W\d]", re.UNICODE)
+_re_non_punct = re.compile(r'[^\W\d]', re.UNICODE)
 """Matches token types that are not merely punctuation. (Types for
 numeric tokens are changed to ##number## and hence contain alpha.)"""
 
@@ -310,10 +314,7 @@ def _pair_iter(it):
     pair will have None as its second element.
     """
     it = iter(it)
-    try:
-        prev = next(it)
-    except StopIteration:
-        return
+    prev = next(it)
     for el in it:
         yield (prev, el)
         prev = el
@@ -366,17 +367,17 @@ class PunktParameters(object):
     def _debug_ortho_context(self, typ):
         c = self.ortho_context[typ]
         if c & _ORTHO_BEG_UC:
-            yield "BEG-UC"
+            yield 'BEG-UC'
         if c & _ORTHO_MID_UC:
-            yield "MID-UC"
+            yield 'MID-UC'
         if c & _ORTHO_UNK_UC:
-            yield "UNK-UC"
+            yield 'UNK-UC'
         if c & _ORTHO_BEG_LC:
-            yield "BEG-LC"
+            yield 'BEG-LC'
         if c & _ORTHO_MID_LC:
-            yield "MID-LC"
+            yield 'MID-LC'
         if c & _ORTHO_UNK_LC:
-            yield "UNK-LC"
+            yield 'UNK-LC'
 
 
 ######################################################################
@@ -384,17 +385,18 @@ class PunktParameters(object):
 ######################################################################
 
 
+@python_2_unicode_compatible
 class PunktToken(object):
     """Stores a token of text with annotations produced during
     sentence boundary detection."""
 
-    _properties = ["parastart", "linestart", "sentbreak", "abbr", "ellipsis"]
-    __slots__ = ["tok", "type", "period_final"] + _properties
+    _properties = ['parastart', 'linestart', 'sentbreak', 'abbr', 'ellipsis']
+    __slots__ = ['tok', 'type', 'period_final'] + _properties
 
     def __init__(self, tok, **params):
         self.tok = tok
         self.type = self._get_type(tok)
-        self.period_final = tok.endswith(".")
+        self.period_final = tok.endswith('.')
 
         for p in self._properties:
             setattr(self, p, None)
@@ -405,10 +407,10 @@ class PunktToken(object):
     # { Regular expressions for properties
     # ////////////////////////////////////////////////////////////
     # Note: [A-Za-z] is approximated by [^\W\d] in the general case.
-    _RE_ELLIPSIS = re.compile(r"\.\.+$")
-    _RE_NUMERIC = re.compile(r"^-?[\.,]?\d[\d,\.-]*\.?$")
-    _RE_INITIAL = re.compile(r"[^\W\d]\.$", re.UNICODE)
-    _RE_ALPHA = re.compile(r"[^\W\d]+$", re.UNICODE)
+    _RE_ELLIPSIS = re.compile(r'\.\.+$')
+    _RE_NUMERIC = re.compile(r'^-?[\.,]?\d[\d,\.-]*\.?$')
+    _RE_INITIAL = re.compile(r'[^\W\d]\.$', re.UNICODE)
+    _RE_ALPHA = re.compile(r'[^\W\d]+$', re.UNICODE)
 
     # ////////////////////////////////////////////////////////////
     # { Derived properties
@@ -416,14 +418,14 @@ class PunktToken(object):
 
     def _get_type(self, tok):
         """Returns a case-normalized representation of the token."""
-        return self._RE_NUMERIC.sub("##number##", tok.lower())
+        return self._RE_NUMERIC.sub('##number##', tok.lower())
 
     @property
     def type_no_period(self):
         """
         The type with its final period removed if it has one.
         """
-        if len(self.type) > 1 and self.type[-1] == ".":
+        if len(self.type) > 1 and self.type[-1] == '.':
             return self.type[:-1]
         return self.type
 
@@ -450,10 +452,10 @@ class PunktToken(object):
     @property
     def first_case(self):
         if self.first_lower:
-            return "lower"
+            return 'lower'
         elif self.first_upper:
-            return "upper"
-        return "none"
+            return 'upper'
+        return 'none'
 
     @property
     def is_ellipsis(self):
@@ -463,7 +465,7 @@ class PunktToken(object):
     @property
     def is_number(self):
         """True if the token text is that of a number."""
-        return self.type.startswith("##number##")
+        return self.type.startswith('##number##')
 
     @property
     def is_initial(self):
@@ -490,17 +492,17 @@ class PunktToken(object):
         with eval(), which lists all the token's non-default
         annotations.
         """
-        typestr = " type=%s," % repr(self.type) if self.type != self.tok else ""
+        typestr = ' type=%s,' % unicode_repr(self.type) if self.type != self.tok else ''
 
-        propvals = ", ".join(
-            "%s=%s" % (p, repr(getattr(self, p)))
+        propvals = ', '.join(
+            '%s=%s' % (p, unicode_repr(getattr(self, p)))
             for p in self._properties
             if getattr(self, p)
         )
 
-        return "%s(%s,%s %s)" % (
+        return '%s(%s,%s %s)' % (
             self.__class__.__name__,
-            repr(self.tok),
+            unicode_repr(self.tok),
             typestr,
             propvals,
         )
@@ -511,11 +513,11 @@ class PunktToken(object):
         """
         res = self.tok
         if self.abbr:
-            res += "<A>"
+            res += '<A>'
         if self.ellipsis:
-            res += "<E>"
+            res += '<E>'
         if self.sentbreak:
-            res += "<S>"
+            res += '<S>'
         return res
 
 
@@ -553,16 +555,11 @@ class PunktBaseClass(object):
         respectively.
         """
         parastart = False
-        for line in plaintext.split("\n"):
+        for line in plaintext.split('\n'):
             if line.strip():
                 line_toks = iter(self._lang_vars.word_tokenize(line))
 
-                try:
-                    tok = next(line_toks)
-                except StopIteration:
-                    continue
-
-                yield self._Token(tok, parastart=parastart, linestart=True)
+                yield self._Token(next(line_toks), parastart=parastart, linestart=True)
                 parastart = False
 
                 for t in line_toks:
@@ -606,10 +603,10 @@ class PunktBaseClass(object):
             aug_tok.sentbreak = True
         elif aug_tok.is_ellipsis:
             aug_tok.ellipsis = True
-        elif aug_tok.period_final and not tok.endswith(".."):
+        elif aug_tok.period_final and not tok.endswith('..'):
             if (
                 tok[:-1].lower() in self._params.abbrev_types
-                or tok[:-1].lower().split("-")[-1] in self._params.abbrev_types
+                or tok[:-1].lower().split('-')[-1] in self._params.abbrev_types
             ):
 
                 aug_tok.abbr = True
@@ -763,12 +760,12 @@ class PunktTrainer(PunktBaseClass):
                 if is_add:
                     self._params.abbrev_types.add(abbr)
                     if verbose:
-                        print(("  Abbreviation: [%6.4f] %s" % (score, abbr)))
+                        print(('  Abbreviation: [%6.4f] %s' % (score, abbr)))
             else:
                 if not is_add:
                     self._params.abbrev_types.remove(abbr)
                     if verbose:
-                        print(("  Removed abbreviation: [%6.4f] %s" % (score, abbr)))
+                        print(('  Removed abbreviation: [%6.4f] %s' % (score, abbr)))
 
         # Make a preliminary pass through the document, marking likely
         # sentence breaks, abbreviations, and ellipsis tokens.
@@ -791,7 +788,7 @@ class PunktTrainer(PunktBaseClass):
             if self._is_rare_abbrev_type(aug_tok1, aug_tok2):
                 self._params.abbrev_types.add(aug_tok1.type_no_period)
                 if verbose:
-                    print(("  Rare Abbrev: %s" % aug_tok1.type))
+                    print(('  Rare Abbrev: %s' % aug_tok1.type))
 
             # Does second token have a high likelihood of starting a sentence?
             if self._is_potential_sent_starter(aug_tok2, aug_tok1):
@@ -815,13 +812,13 @@ class PunktTrainer(PunktBaseClass):
         for typ, ll in self._find_sent_starters():
             self._params.sent_starters.add(typ)
             if verbose:
-                print(("  Sent Starter: [%6.4f] %r" % (ll, typ)))
+                print(('  Sent Starter: [%6.4f] %r' % (ll, typ)))
 
         self._params.clear_collocations()
         for (typ1, typ2), ll in self._find_collocations():
             self._params.collocations.add((typ1, typ2))
             if verbose:
-                print(("  Collocation: [%6.4f] %r+%r" % (ll, typ1, typ2)))
+                print(('  Collocation: [%6.4f] %r+%r' % (ll, typ1, typ2)))
 
         self._finalized = True
 
@@ -884,7 +881,7 @@ class PunktTrainer(PunktBaseClass):
         positions.
         """
         # 'initial' or 'internal' or 'unknown'
-        context = "internal"
+        context = 'internal'
         tokens = list(tokens)
 
         for aug_tok in tokens:
@@ -892,13 +889,13 @@ class PunktTrainer(PunktBaseClass):
             # that it's a sentence break.  But err on the side of
             # caution (by not positing a sentence break) if we just
             # saw an abbreviation.
-            if aug_tok.parastart and context != "unknown":
-                context = "initial"
+            if aug_tok.parastart and context != 'unknown':
+                context = 'initial'
 
             # If we're at the beginning of a line, then we can't decide
             # between 'internal' and 'initial'.
-            if aug_tok.linestart and context == "internal":
-                context = "unknown"
+            if aug_tok.linestart and context == 'internal':
+                context = 'unknown'
 
             # Find the case-normalized type of the token.  If it's a
             # sentence-final token, strip off the period.
@@ -912,13 +909,13 @@ class PunktTrainer(PunktBaseClass):
             # Decide whether the next word is at a sentence boundary.
             if aug_tok.sentbreak:
                 if not (aug_tok.is_number or aug_tok.is_initial):
-                    context = "initial"
+                    context = 'initial'
                 else:
-                    context = "unknown"
+                    context = 'unknown'
             elif aug_tok.ellipsis or aug_tok.abbr:
-                context = "unknown"
+                context = 'unknown'
             else:
-                context = "internal"
+                context = 'internal'
 
     # ////////////////////////////////////////////////////////////
     # { Abbreviations
@@ -945,10 +942,10 @@ class PunktTrainer(PunktBaseClass):
         for typ in types:
             # Check some basic conditions, to rule out words that are
             # clearly not abbrev_types.
-            if not _re_non_punct.search(typ) or typ == "##number##":
+            if not _re_non_punct.search(typ) or typ == '##number##':
                 continue
 
-            if typ.endswith("."):
+            if typ.endswith('.'):
                 if typ in self._params.abbrev_types:
                     continue
                 typ = typ[:-1]
@@ -960,7 +957,7 @@ class PunktTrainer(PunktBaseClass):
 
             # Count how many periods & nonperiods are in the
             # candidate.
-            num_periods = typ.count(".") + 1
+            num_periods = typ.count('.') + 1
             num_nonperiods = len(typ) - num_periods + 1
 
             # Let <a> be the candidate without the period, and <b>
@@ -968,7 +965,7 @@ class PunktTrainer(PunktBaseClass):
             # indicates whether <ab> occurs as a single unit (high
             # value of ll), or as two independent units <a> and
             # <b> (low value of ll).
-            count_with_period = self._type_fdist[typ + "."]
+            count_with_period = self._type_fdist[typ + '.']
             count_without_period = self._type_fdist[typ]
             ll = self._dunning_log_likelihood(
                 count_with_period + count_without_period,
@@ -998,7 +995,7 @@ class PunktTrainer(PunktBaseClass):
         This fails to include abbreviations otherwise found as "rare".
         """
         self._params.clear_abbrevs()
-        tokens = (typ for typ in self._type_fdist if typ and typ.endswith("."))
+        tokens = (typ for typ in self._type_fdist if typ and typ.endswith('.'))
         for abbr, score, is_add in self._reclassify_abbrev_types(tokens):
             if score >= self.ABBREV:
                 self._params.abbrev_types.add(abbr)
@@ -1151,8 +1148,8 @@ class PunktTrainer(PunktBaseClass):
                 continue
 
             col_count = self._collocation_fdist[types]
-            typ1_count = self._type_fdist[typ1] + self._type_fdist[typ1 + "."]
-            typ2_count = self._type_fdist[typ2] + self._type_fdist[typ2 + "."]
+            typ1_count = self._type_fdist[typ1] + self._type_fdist[typ1 + '.']
+            typ2_count = self._type_fdist[typ2] + self._type_fdist[typ2 + '.']
             if (
                 typ1_count > 1
                 and typ2_count > 1
@@ -1196,7 +1193,7 @@ class PunktTrainer(PunktBaseClass):
                 continue
 
             typ_at_break_count = self._sent_starter_fdist[typ]
-            typ_count = self._type_fdist[typ] + self._type_fdist[typ + "."]
+            typ_count = self._type_fdist[typ] + self._type_fdist[typ + '.']
             if typ_count < typ_at_break_count:
                 # needed after freq_threshold
                 continue
@@ -1255,7 +1252,7 @@ class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
         given. Repeated calls to this method destroy previous parameters. For
         incremental training, instantiate a separate PunktTrainer instance.
         """
-        if not isinstance(train_text, str):
+        if not isinstance(train_text, string_types):
             return train_text
         return PunktTrainer(
             train_text, lang_vars=self._lang_vars, token_cls=self._Token
@@ -1280,7 +1277,7 @@ class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
         """
 
         for match in self._lang_vars.period_context_re().finditer(text):
-            decision_text = match.group() + match.group("after_tok")
+            decision_text = match.group() + match.group('after_tok')
             tokens = self._tokenize_words(decision_text)
             tokens = list(self._annotate_first_pass(tokens))
             while not tokens[0].period_final:
@@ -1328,12 +1325,12 @@ class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
     def _slices_from_text(self, text):
         last_break = 0
         for match in self._lang_vars.period_context_re().finditer(text):
-            context = match.group() + match.group("after_tok")
+            context = match.group() + match.group('after_tok')
             if self.text_contains_sentbreak(context):
                 yield slice(last_break, match.end())
-                if match.group("next_tok"):
+                if match.group('next_tok'):
                     # next sentence starts after whitespace
-                    last_break = match.start("next_tok")
+                    last_break = match.start('next_tok')
                 else:
                     # next sentence starts at following punctuation
                     last_break = match.end()
@@ -1440,9 +1437,9 @@ class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
         pos = 0
 
         # A regular expression that finds pieces of whitespace:
-        WS_REGEXP = re.compile(r"\s*")
+        WS_REGEXP = re.compile(r'\s*')
 
-        sentence = ""
+        sentence = ''
         for aug_tok in tokens:
             tok = aug_tok.tok
 
@@ -1456,7 +1453,7 @@ class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
             # token doesn't match, see if adding whitespace helps.
             # If so, then use the version with whitespace.
             if text[pos : pos + len(tok)] != tok:
-                pat = "\s*".join(re.escape(c) for c in tok)
+                pat = '\s*'.join(re.escape(c) for c in tok)
                 m = re.compile(pat).match(text, pos)
                 if m:
                     tok = m.group()
@@ -1475,7 +1472,7 @@ class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
             # If we're at a sentence break, then start a new sentence.
             if aug_tok.sentbreak:
                 yield sentence
-                sentence = ""
+                sentence = ''
 
         # If the last sentence is empty, discard it.
         if sentence:
@@ -1483,15 +1480,15 @@ class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
 
     # [XX] TESTING
     def dump(self, tokens):
-        print("writing to /tmp/punkt.new...")
-        with open("/tmp/punkt.new", "w") as outfile:
+        print('writing to /tmp/punkt.new...')
+        with open('/tmp/punkt.new', 'w') as outfile:
             for aug_tok in tokens:
                 if aug_tok.parastart:
-                    outfile.write("\n\n")
+                    outfile.write('\n\n')
                 elif aug_tok.linestart:
-                    outfile.write("\n")
+                    outfile.write('\n')
                 else:
-                    outfile.write(" ")
+                    outfile.write(' ')
 
                 outfile.write(str(aug_tok))
 
@@ -1499,7 +1496,7 @@ class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
     # { Customization Variables
     # ////////////////////////////////////////////////////////////
 
-    PUNCTUATION = tuple(";:,.!?")
+    PUNCTUATION = tuple(';:,.!?')
 
     # ////////////////////////////////////////////////////////////
     # { Annotation Procedures
@@ -1568,7 +1565,7 @@ class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
         # [4.3. Token-Based Detection of Initials and Ordinals]
         # Check if any initial or ordinal tokens that are marked
         # as sentbreaks should be reclassified as abbreviations.
-        if tok_is_initial or typ == "##number##":
+        if tok_is_initial or typ == '##number##':
 
             # [4.1.1. Orthographic Heuristic] Check if there's
             # orthographic evidence about whether the next word
@@ -1587,7 +1584,7 @@ class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
             # heuristic is unknown, and next word is always
             # capitalized, then mark as abbrev (eg: J. Bach).
             if (
-                is_sent_starter == "unknown"
+                is_sent_starter == 'unknown'
                 and tok_is_initial
                 and aug_tok2.first_upper
                 and not (self._params.ortho_context[next_typ] & _ORTHO_LC)
@@ -1628,10 +1625,10 @@ class PunktSentenceTokenizer(PunktBaseClass, TokenizerI):
             return False
 
         # Otherwise, we're not sure.
-        return "unknown"
+        return 'unknown'
 
 
-DEBUG_DECISION_FMT = """Text: %(text)r (at offset %(period_index)d)
+DEBUG_DECISION_FMT = '''Text: %(text)r (at offset %(period_index)d)
 Sentence break? %(break_decision)s (%(reason)s)
 Collocation? %(collocation)s
 %(type1)r:
@@ -1641,7 +1638,7 @@ Collocation? %(collocation)s
     known sentence starter: %(type2_is_sent_starter)s
     orthographic heuristic suggests is a sentence starter? %(type2_ortho_heuristic)s
     orthographic contexts in training: %(type2_ortho_contexts)s
-"""
+'''
 
 
 def format_debug_decision(d):
@@ -1651,7 +1648,7 @@ def format_debug_decision(d):
 def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
     """Builds a punkt model and applies it to the same text"""
     cleanup = (
-        lambda s: re.compile(r"(?:\r|^\s+)", re.MULTILINE).sub("", s).replace("\n", " ")
+        lambda s: re.compile(r'(?:\r|^\s+)', re.MULTILINE).sub('', s).replace('\n', ' ')
     )
     trainer = train_cls()
     trainer.INCLUDE_ALL_COLLOCS = True
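
A usage sketch of the Punkt classes defined in this file; the training and test texts are made up, and INCLUDE_ALL_COLLOCS is only set to mirror what demo() does.

    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

    train_text = (
        "Dr. Brown visited the lab. He arrived at 9 a.m. and left at noon. "
        "Prof. Green joined him for lunch."
    )

    # Learn abbreviations, collocations and sentence starters from raw text.
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(train_text)

    # Build a sentence tokenizer from the trained parameters and apply it.
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    for sent in tokenizer.tokenize("Mr. White met Dr. Brown. They talked for an hour."):
        print(sent)
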
index dd4630e..9f7a1ee 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Tokenizers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 #         Trevor Cohn <tacohn@csse.unimelb.edu.au>
@@ -65,13 +65,16 @@ argument.  This differs from the conventions used by Python's
 ``re`` functions, where the pattern is always the first argument.
 (This is for consistency with the other NLTK tokenizers.)
 """
+from __future__ import unicode_literals
 
 import re
 
 from nltk.tokenize.api import TokenizerI
 from nltk.tokenize.util import regexp_span_tokenize
+from nltk.compat import python_2_unicode_compatible
 
 
+@python_2_unicode_compatible
 class RegexpTokenizer(TokenizerI):
     """
     A tokenizer that splits a string using a regular expression, which
@@ -107,7 +110,7 @@ class RegexpTokenizer(TokenizerI):
         flags=re.UNICODE | re.MULTILINE | re.DOTALL,
     ):
         # If they gave us a regexp object, extract the pattern.
-        pattern = getattr(pattern, "pattern", pattern)
+        pattern = getattr(pattern, 'pattern', pattern)
 
         self._pattern = pattern
         self._gaps = gaps
@@ -144,7 +147,7 @@ class RegexpTokenizer(TokenizerI):
                 yield m.span()
 
     def __repr__(self):
-        return "%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)" % (
+        return '%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)' % (
             self.__class__.__name__,
             self._pattern,
             self._gaps,
@@ -166,7 +169,7 @@ class WhitespaceTokenizer(RegexpTokenizer):
     """
 
     def __init__(self):
-        RegexpTokenizer.__init__(self, r"\s+", gaps=True)
+        RegexpTokenizer.__init__(self, r'\s+', gaps=True)
 
 
 class BlanklineTokenizer(RegexpTokenizer):
@@ -177,7 +180,7 @@ class BlanklineTokenizer(RegexpTokenizer):
     """
 
     def __init__(self):
-        RegexpTokenizer.__init__(self, r"\s*\n\s*\n\s*", gaps=True)
+        RegexpTokenizer.__init__(self, r'\s*\n\s*\n\s*', gaps=True)
 
 
 class WordPunctTokenizer(RegexpTokenizer):
@@ -193,7 +196,7 @@ class WordPunctTokenizer(RegexpTokenizer):
     """
 
     def __init__(self):
-        RegexpTokenizer.__init__(self, r"\w+|[^\w\s]+")
+        RegexpTokenizer.__init__(self, r'\w+|[^\w\s]+')
 
 
 ######################################################################
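
A short usage sketch for the regexp-based tokenizers in this file; the example string follows the style of the NLTK doctests.

    from nltk.tokenize import RegexpTokenizer, WordPunctTokenizer

    s = "Good muffins cost $3.88\nin New York."

    # gaps=False (default): the pattern describes the tokens themselves.
    print(RegexpTokenizer(r"\w+|\$[\d\.]+|\S+").tokenize(s))

    # gaps=True: the pattern describes the separators, as in WhitespaceTokenizer.
    print(RegexpTokenizer(r"\s+", gaps=True).tokenize(s))

    # Alternating alphabetic and non-alphabetic tokens.
    print(WordPunctTokenizer().tokenize(s))
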
index 49b5139..2cf7a50 100644 (file)
@@ -8,12 +8,16 @@
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import unicode_literals, print_function
+
 import os
 import re
 import sys
 import subprocess
 import tempfile
 
+from six import text_type
+
 from nltk.data import ZipFilePathPointer
 from nltk.internals import find_dir
 
@@ -40,20 +44,20 @@ class ReppTokenizer(TokenizerI):
     (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
 
     >>> for sent in tokenizer.tokenize_sents(sents): # doctest: +SKIP
-    ...     print(sent)                              # doctest: +SKIP
+    ...     print sent                               # doctest: +SKIP
     ...
     (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
     (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
     (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
     >>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True): # doctest: +SKIP
-    ...     print(sent)                                                         # doctest: +SKIP
+    ...     print sent                                                          # doctest: +SKIP
     ...
     [(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
     [(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
     [(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)]
     """
 
-    def __init__(self, repp_dir, encoding="utf8"):
+    def __init__(self, repp_dir, encoding='utf8'):
         self.repp_dir = self.find_repptokenizer(repp_dir)
         # Set a directory to store the temporary files.
         self.working_dir = tempfile.gettempdir()
@@ -81,11 +85,11 @@ class ReppTokenizer(TokenizerI):
         :rtype: iter(tuple(str))
         """
         with tempfile.NamedTemporaryFile(
-            prefix="repp_input.", dir=self.working_dir, mode="w", delete=False
+            prefix='repp_input.', dir=self.working_dir, mode='w', delete=False
         ) as input_file:
             # Write sentences to temporary input file.
             for sent in sentences:
-                input_file.write(str(sent) + "\n")
+                input_file.write(text_type(sent) + '\n')
             input_file.close()
             # Generate command to run REPP.
             cmd = self.generate_repp_command(input_file.name)
@@ -104,9 +108,9 @@ class ReppTokenizer(TokenizerI):
         :param inputfilename: path to the input file
         :type inputfilename: str
         """
-        cmd = [self.repp_dir + "/src/repp"]
-        cmd += ["-c", self.repp_dir + "/erg/repp.set"]
-        cmd += ["--format", "triple"]
+        cmd = [self.repp_dir + '/src/repp']
+        cmd += ['-c', self.repp_dir + '/erg/repp.set']
+        cmd += ['--format', 'triple']
         cmd += [inputfilename]
         return cmd
 
@@ -128,8 +132,8 @@ class ReppTokenizer(TokenizerI):
         :return: an iterable of the tokenized sentences as tuples of strings
         :rtype: iter(tuple)
         """
-        line_regex = re.compile("^\((\d+), (\d+), (.+)\)$", re.MULTILINE)
-        for section in repp_output.split("\n\n"):
+        line_regex = re.compile('^\((\d+), (\d+), (.+)\)$', re.MULTILINE)
+        for section in repp_output.split('\n\n'):
             words_with_positions = [
                 (token, int(start), int(end))
                 for start, end, token in line_regex.findall(section)
@@ -144,8 +148,8 @@ class ReppTokenizer(TokenizerI):
         if os.path.exists(repp_dirname):  # If a full path is given.
             _repp_dir = repp_dirname
         else:  # Try to find path to REPP directory in environment variables.
-            _repp_dir = find_dir(repp_dirname, env_vars=("REPP_TOKENIZER",))
+            _repp_dir = find_dir(repp_dirname, env_vars=('REPP_TOKENIZER',))
         # Checks for the REPP binary and erg/repp.set config file.
-        assert os.path.exists(_repp_dir + "/src/repp")
-        assert os.path.exists(_repp_dir + "/erg/repp.set")
+        assert os.path.exists(_repp_dir + '/src/repp')
+        assert os.path.exists(_repp_dir + '/erg/repp.set')
         return _repp_dir
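
The parsing step in parse_repp_output can be exercised without the REPP binary; the sample output below is made up but follows REPP's --format triple layout.

    import re

    # Two sentences in REPP's triple format, separated by a blank line.
    repp_output = (
        "(0, 12, Tokenization)\n"
        "(13, 15, is)\n"
        "(16, 20, easy)\n"
        "(20, 21, .)\n"
        "\n"
        "(0, 2, It)\n"
        "(3, 9, really)\n"
        "(10, 12, is)\n"
        "(12, 13, .)\n"
    )

    line_regex = re.compile(r"^\((\d+), (\d+), (.+)\)$", re.MULTILINE)

    def parse_triples(output):
        # One blank-line-separated section per input sentence.
        for section in output.split("\n\n"):
            yield [
                (token, int(start), int(end))
                for start, end, token in line_regex.findall(section)
            ]

    for sent in parse_triples(repp_output):
        print(sent)
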
index 9313a94..e2a1dd6 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Tokenizers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Yoav Goldberg <yoavg@cs.bgu.ac.il>
 #         Steven Bird <stevenbird1@gmail.com> (minor edits)
 # URL: <http://nltk.sourceforge.net>
@@ -76,14 +76,14 @@ class SExprTokenizer(TokenizerI):
     :param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
     """
 
-    def __init__(self, parens="()", strict=True):
+    def __init__(self, parens='()', strict=True):
         if len(parens) != 2:
-            raise ValueError("parens must contain exactly two strings")
+            raise ValueError('parens must contain exactly two strings')
         self._strict = strict
         self._open_paren = parens[0]
         self._close_paren = parens[1]
         self._paren_regexp = re.compile(
-            "%s|%s" % (re.escape(parens[0]), re.escape(parens[1]))
+            '%s|%s' % (re.escape(parens[0]), re.escape(parens[1]))
         )
 
     def tokenize(self, text):
@@ -125,13 +125,13 @@ class SExprTokenizer(TokenizerI):
                 depth += 1
             if paren == self._close_paren:
                 if self._strict and depth == 0:
-                    raise ValueError("Un-matched close paren at char %d" % m.start())
+                    raise ValueError('Un-matched close paren at char %d' % m.start())
                 depth = max(0, depth - 1)
                 if depth == 0:
                     result.append(text[pos : m.end()])
                     pos = m.end()
         if self._strict and depth > 0:
-            raise ValueError("Un-matched open paren at char %d" % pos)
+            raise ValueError('Un-matched open paren at char %d' % pos)
         if pos < len(text):
             result.append(text[pos:])
         return result
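
A usage sketch for SExprTokenizer; the bracket pair passed to parens is an arbitrary example.

    from nltk.tokenize import SExprTokenizer

    # Default: split on balanced round parentheses, strict about mismatches.
    print(SExprTokenizer().tokenize("(a b (c d)) e (f)"))

    # A different delimiter pair; strict=False tolerates unbalanced brackets.
    print(SExprTokenizer(parens="{}", strict=False).tokenize("{a b {c d}} e {f}"))
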
index ac1e400..c467678 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Simple Tokenizers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.sourceforge.net>
@@ -34,7 +34,7 @@ that expects a tokenizer.  For example, these tokenizers can be used
 to specify the tokenization conventions when building a `CorpusReader`.
 
 """
-
+from __future__ import unicode_literals
 from nltk.tokenize.api import TokenizerI, StringTokenizer
 from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
 
@@ -50,7 +50,7 @@ class SpaceTokenizer(StringTokenizer):
         'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
     """
 
-    _string = " "
+    _string = ' '
 
 
 class TabTokenizer(StringTokenizer):
@@ -62,7 +62,7 @@ class TabTokenizer(StringTokenizer):
         ['a', 'b c\n', ' d']
     """
 
-    _string = "\t"
+    _string = '\t'
 
 
 class CharTokenizer(StringTokenizer):
@@ -101,11 +101,11 @@ class LineTokenizer(TokenizerI):
            a corresponding token ``''`` after that newline.
     """
 
-    def __init__(self, blanklines="discard"):
-        valid_blanklines = ("discard", "keep", "discard-eof")
+    def __init__(self, blanklines='discard'):
+        valid_blanklines = ('discard', 'keep', 'discard-eof')
         if blanklines not in valid_blanklines:
             raise ValueError(
-                "Blank lines must be one of: %s" % " ".join(valid_blanklines)
+                'Blank lines must be one of: %s' % ' '.join(valid_blanklines)
             )
 
         self._blanklines = blanklines
@@ -113,20 +113,20 @@ class LineTokenizer(TokenizerI):
     def tokenize(self, s):
         lines = s.splitlines()
         # If requested, strip off blank lines.
-        if self._blanklines == "discard":
+        if self._blanklines == 'discard':
             lines = [l for l in lines if l.rstrip()]
-        elif self._blanklines == "discard-eof":
+        elif self._blanklines == 'discard-eof':
             if lines and not lines[-1].strip():
                 lines.pop()
         return lines
 
     # discard-eof not implemented
     def span_tokenize(self, s):
-        if self._blanklines == "keep":
-            for span in string_span_tokenize(s, r"\n"):
+        if self._blanklines == 'keep':
+            for span in string_span_tokenize(s, r'\n'):
                 yield span
         else:
-            for span in regexp_span_tokenize(s, r"\n(\s+\n)*"):
+            for span in regexp_span_tokenize(s, r'\n(\s+\n)*'):
                 yield span
 
 
@@ -136,5 +136,5 @@ class LineTokenizer(TokenizerI):
 # XXX: it is stated in module docs that there are no function versions
 
 
-def line_tokenize(text, blanklines="discard"):
+def line_tokenize(text, blanklines='discard'):
     return LineTokenizer(blanklines).tokenize(text)
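
A usage sketch for LineTokenizer's three blanklines modes; the sample string is made up.

    from nltk.tokenize import LineTokenizer

    s = "line one\n\nline two\nline three\n\n"

    # 'discard' drops blank lines, 'keep' preserves them,
    # 'discard-eof' only drops a trailing blank line.
    for mode in ("discard", "keep", "discard-eof"):
        print(mode, LineTokenizer(blanklines=mode).tokenize(s))
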
diff --git a/nlp_resource_data/nltk/tokenize/sonority_sequencing.py b/nlp_resource_data/nltk/tokenize/sonority_sequencing.py
deleted file mode 100644 (file)
index fb6b080..0000000
+++ /dev/null
@@ -1,192 +0,0 @@
-# Natural Language Toolkit: Tokenizers
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Christopher Hench <chris.l.hench@gmail.com>
-#         Alex Estes
-# URL: <http://nltk.sourceforge.net>
-# For license information, see LICENSE.TXT
-
-"""
-The Sonority Sequencing Principle (SSP) is a language agnostic algorithm proposed
-by Otto Jesperson in 1904. The sonorous quality of a phoneme is judged by the
-openness of the lips. Syllable breaks occur before troughs in sonority. For more
-on the SSP see Selkirk (1984).
-
-The default implementation uses the English alphabet, but the `sonority_hierarchy`
-can be modified to IPA or any other alphabet for the use-case. The SSP is a
-universal syllabification algorithm, but that does not mean it performs equally
-across languages. Bartlett et al. (2009) is a good benchmark for English accuracy
-if utilizing IPA (pg. 311).
-
-Importantly, if a custom hierarchy is supplied and vowels span across more than
-one level, they should be given separately to the `vowels` class attribute.
-
-References:
-- Otto Jespersen. 1904. Lehrbuch der Phonetik.
-  Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
-- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
-  In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
-  Cambridge, MIT Press. pp. 107-136.
-- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
-  In HLT-NAACL. pp. 308-316.
-"""
-
-import warnings
-
-import re
-from string import punctuation
-
-from nltk.tokenize.api import TokenizerI
-from nltk.util import ngrams
-
-
-class SyllableTokenizer(TokenizerI):
-    """
-    Syllabifies words based on the Sonority Sequencing Principle (SSP).
-
-        >>> from nltk.tokenize import SyllableTokenizer
-        >>> from nltk import word_tokenize
-        >>> SSP = SyllableTokenizer()
-        >>> SSP.tokenize('justification')
-        ['jus', 'ti', 'fi', 'ca', 'tion']
-        >>> text = "This is a foobar-like sentence."
-        >>> [SSP.tokenize(token) for token in word_tokenize(text)]
-        [['This'], ['is'], ['a'], ['foo', 'bar', '-', 'li', 'ke'], ['sen', 'ten', 'ce'], ['.']]
-    """
-
-    def __init__(self, lang="en", sonority_hierarchy=False):
-        """
-        :param lang: Language parameter, default is English, 'en'
-        :type lang: str
-        :param sonority_hierarchy: Sonority hierarchy according to the
-                                   Sonority Sequencing Principle.
-        :type sonority_hierarchy: list(str)
-        """
-        # Sonority hierarchy should be provided in descending order.
-        # If vowels are spread across multiple levels, they should be
-        # assigned to the self.vowels attribute together; otherwise they should be
-        # placed in the first index of the hierarchy.
-        if not sonority_hierarchy and lang == "en":
-            sonority_hierarchy = [
-                "aeiouy",  # vowels.
-                "lmnrw",  # nasals.
-                "zvsf",  # fricatives.
-                "bcdgtkpqxhj",  # stops.
-            ]
-
-        self.vowels = sonority_hierarchy[0]
-        self.phoneme_map = {}
-        for i, level in enumerate(sonority_hierarchy):
-            for c in level:
-                sonority_level = len(sonority_hierarchy) - i
-                self.phoneme_map[c] = sonority_level
-                self.phoneme_map[c.upper()] = sonority_level
-
-    def assign_values(self, token):
-        """
-        Assigns each phoneme its value from the sonority hierarchy.
-        Note: Sentence/text has to be tokenized first.
-
-        :param token: Single word or token
-        :type token: str
-        :return: List of tuples, first element is character/phoneme and
-                 second is the sonority value.
-        :rtype: list(tuple(str, int))
-        """
-        syllables_values = []
-        for c in token:
-            try:
-                syllables_values.append((c, self.phoneme_map[c]))
-            except KeyError:
-                if c not in punctuation:
-                    warnings.warn(
-                        "Character not defined in sonority_hierarchy,"
-                        " assigning as vowel: '{}'".format(c)
-                    )
-                    syllables_values.append((c, max(self.phoneme_map.values())))
-                    self.vowels += c
-                else:  # If it's punctuation, assign -1.
-                    syllables_values.append((c, -1))
-        return syllables_values
-
-    def validate_syllables(self, syllable_list):
-        """
-        Ensures each syllable has at least one vowel.
-        If the following syllable doesn't have a vowel, add it to the current one.
-
-        :param syllable_list: Single word or token broken up into syllables.
-        :type syllable_list: list(str)
-        :return: Single word or token broken up into syllables
-                 (with added syllables if necessary)
-        :rtype: list(str)
-        """
-        valid_syllables = []
-        front = ""
-        for i, syllable in enumerate(syllable_list):
-            if syllable in punctuation:
-                valid_syllables.append(syllable)
-                continue
-            if not re.search("|".join(self.vowels), syllable):
-                if len(valid_syllables) == 0:
-                    front += syllable
-                else:
-                    valid_syllables = valid_syllables[:-1] + [
-                        valid_syllables[-1] + syllable
-                    ]
-            else:
-                if len(valid_syllables) == 0:
-                    valid_syllables.append(front + syllable)
-                else:
-                    valid_syllables.append(syllable)
-
-        return valid_syllables
-
-    def tokenize(self, token):
-        """
-        Apply the SSP to return a list of syllables.
-        Note: Sentence/text has to be tokenized first.
-
-        :param token: Single word or token
-        :type token: str
-        :return syllable_list: Single word or token broken up into syllables.
-        :rtype: list(str)
-        """
-        # assign values from hierarchy
-        syllables_values = self.assign_values(token)
-
-        # if only one vowel return word
-        if sum(token.count(x) for x in self.vowels) <= 1:
-            return [token]
-
-        syllable_list = []
-        syllable = syllables_values[0][0]  # start syllable with first phoneme
-        for trigram in ngrams(syllables_values, n=3):
-            phonemes, values = zip(*trigram)
-            # Sonority of previous, focal and following phoneme
-            prev_value, focal_value, next_value = values
-            # Focal phoneme.
-            focal_phoneme = phonemes[1]
-
-            # These cases trigger syllable break.
-            if focal_value == -1:  # If it's a punctuation, just break.
-                syllable_list.append(syllable)
-                syllable_list.append(focal_phoneme)
-                syllable = ""
-            elif prev_value >= focal_value == next_value:
-                syllable += focal_phoneme
-                syllable_list.append(syllable)
-                syllable = ""
-
-            elif prev_value > focal_value < next_value:
-                syllable_list.append(syllable)
-                syllable = ""
-                syllable += focal_phoneme
-
-            # no syllable break
-            else:
-                syllable += focal_phoneme
-
-        syllable += syllables_values[-1][0]  # append last phoneme
-        syllable_list.append(syllable)
-
-        return self.validate_syllables(syllable_list)
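
As a compact, hedged restatement of the sonority rule this class applies (break before a sonority trough, and after a non-rising plateau), the sketch below hard-codes the English hierarchy and omits the single-vowel and punctuation special cases.

    # Sonority levels: stops < fricatives < nasals/liquids < vowels.
    SONORITY = {}
    for level, chars in enumerate(["bcdgtkpqxhj", "zvsf", "lmnrw", "aeiouy"], start=1):
        for c in chars:
            SONORITY[c] = level

    def syllabify(word):
        word = word.lower()
        if len(word) < 2:
            return [word]
        values = [SONORITY.get(c, 0) for c in word]
        syllables, current = [], word[0]
        for i in range(1, len(word) - 1):
            prev, focal, nxt = values[i - 1], values[i], values[i + 1]
            if prev > focal < nxt:
                # Sonority trough: the break goes before the focal phoneme.
                syllables.append(current)
                current = word[i]
            elif prev >= focal == nxt:
                # Non-rising plateau: the break goes after the focal phoneme.
                current += word[i]
                syllables.append(current)
                current = ""
            else:
                current += word[i]
        current += word[-1]
        syllables.append(current)
        return syllables

    print(syllabify("justification"))  # -> ['jus', 'ti', 'fi', 'ca', 'tion']
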
index b17f591..93fb219 100644 (file)
@@ -1,23 +1,27 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Interface to the Stanford Tokenizer
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Xu <xxu@student.unimelb.edu.au>
 #
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import unicode_literals, print_function
+
 import tempfile
 import os
 import json
 from subprocess import PIPE
 import warnings
 
+from six import text_type
+
 from nltk.internals import find_jar, config_java, java, _java_options
 from nltk.tokenize.api import TokenizerI
 from nltk.parse.corenlp import CoreNLPParser
 
-_stanford_url = "https://nlp.stanford.edu/software/tokenizer.shtml"
+_stanford_url = 'https://nlp.stanford.edu/software/tokenizer.shtml'
 
 
 class StanfordTokenizer(TokenizerI):
@@ -33,15 +37,15 @@ class StanfordTokenizer(TokenizerI):
     ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
     """
 
-    _JAR = "stanford-postagger.jar"
+    _JAR = 'stanford-postagger.jar'
 
     def __init__(
         self,
         path_to_jar=None,
-        encoding="utf8",
+        encoding='utf8',
         options=None,
         verbose=False,
-        java_options="-mx1000m",
+        java_options='-mx1000m',
     ):
         # Raise deprecation warning.
         warnings.warn(
@@ -57,7 +61,7 @@ class StanfordTokenizer(TokenizerI):
         self._stanford_jar = find_jar(
             self._JAR,
             path_to_jar,
-            env_vars=("STANFORD_POSTAGGER",),
+            env_vars=('STANFORD_POSTAGGER',),
             searchpath=(),
             url=_stanford_url,
             verbose=verbose,
@@ -67,8 +71,8 @@ class StanfordTokenizer(TokenizerI):
         self.java_options = java_options
 
         options = {} if options is None else options
-        self._options_cmd = ",".join(
-            "{0}={1}".format(key, val) for key, val in options.items()
+        self._options_cmd = ','.join(
+            '{0}={1}'.format(key, val) for key, val in options.items()
         )
 
     @staticmethod
@@ -79,25 +83,25 @@ class StanfordTokenizer(TokenizerI):
         """
         Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences.
         """
-        cmd = ["edu.stanford.nlp.process.PTBTokenizer"]
+        cmd = ['edu.stanford.nlp.process.PTBTokenizer']
         return self._parse_tokenized_output(self._execute(cmd, s))
 
     def _execute(self, cmd, input_, verbose=False):
         encoding = self._encoding
-        cmd.extend(["-charset", encoding])
+        cmd.extend(['-charset', encoding])
         _options_cmd = self._options_cmd
         if _options_cmd:
-            cmd.extend(["-options", self._options_cmd])
+            cmd.extend(['-options', self._options_cmd])
 
-        default_options = " ".join(_java_options)
+        default_options = ' '.join(_java_options)
 
         # Configure java.
         config_java(options=self.java_options, verbose=verbose)
 
         # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
-        with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
+        with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
             # Write the actual sentences to the temporary input file
-            if isinstance(input_, str) and encoding:
+            if isinstance(input_, text_type) and encoding:
                 input_ = input_.encode(encoding)
             input_file.write(input_)
             input_file.flush()
@@ -125,5 +129,5 @@ def setup_module(module):
         StanfordTokenizer()
     except LookupError:
         raise SkipTest(
-            "doctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn't exist"
+            'doctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn\'t exist'
         )
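
Since this module already imports CoreNLPParser as the non-deprecated route, here is a hedged sketch of that replacement usage; it assumes a Stanford CoreNLP server is already running on localhost:9000, which is outside NLTK itself.

    from nltk.parse.corenlp import CoreNLPParser

    # Assumes a CoreNLP server has been started separately, e.g. with:
    #   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
    parser = CoreNLPParser(url="http://localhost:9000")
    print(list(parser.tokenize("The color of the wall is blue.")))
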
index 2595945..858c4d8 100644 (file)
@@ -3,7 +3,7 @@
 # Natural Language Toolkit: Interface to the Stanford Segmenter
 # for Chinese and Arabic
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: 52nlp <52nlpcn@gmail.com>
 #         Casper Lehmann-Strøm <casperlehmann@gmail.com>
 #         Alex Constantin <alex@keyworder.ch>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import unicode_literals, print_function
+
 import tempfile
 import os
 import json
 import warnings
 from subprocess import PIPE
 
+from six import text_type
+
+from nltk import compat
 from nltk.internals import (
     find_jar,
     find_file,
@@ -28,7 +33,7 @@ from nltk.internals import (
 from nltk.tokenize.api import TokenizerI
 
 
-_stanford_url = "https://nlp.stanford.edu/software"
+_stanford_url = 'https://nlp.stanford.edu/software'
 
 
 class StanfordSegmenter(TokenizerI):
@@ -53,7 +58,7 @@ class StanfordSegmenter(TokenizerI):
     <BLANKLINE>
     """
 
-    _JAR = "stanford-segmenter.jar"
+    _JAR = 'stanford-segmenter.jar'
 
     def __init__(
         self,
@@ -63,15 +68,15 @@ class StanfordSegmenter(TokenizerI):
         path_to_model=None,
         path_to_dict=None,
         path_to_sihan_corpora_dict=None,
-        sihan_post_processing="false",
-        keep_whitespaces="false",
-        encoding="UTF-8",
+        sihan_post_processing='false',
+        keep_whitespaces='false',
+        encoding='UTF-8',
         options=None,
         verbose=False,
-        java_options="-mx2g",
+        java_options='-mx2g',
     ):
         # Raise deprecation warning.
-        warnings.simplefilter("always", DeprecationWarning)
+        warnings.simplefilter('always', DeprecationWarning)
         warnings.warn(
             str(
                 "\nThe StanfordTokenizer will "
@@ -81,21 +86,21 @@ class StanfordSegmenter(TokenizerI):
             DeprecationWarning,
             stacklevel=2,
         )
-        warnings.simplefilter("ignore", DeprecationWarning)
+        warnings.simplefilter('ignore', DeprecationWarning)
 
         stanford_segmenter = find_jar(
             self._JAR,
             path_to_jar,
-            env_vars=("STANFORD_SEGMENTER",),
+            env_vars=('STANFORD_SEGMENTER',),
             searchpath=(),
             url=_stanford_url,
             verbose=verbose,
         )
         if path_to_slf4j is not None:
             slf4j = find_jar(
-                "slf4j-api.jar",
+                'slf4j-api.jar',
                 path_to_slf4j,
-                env_vars=("SLF4J", "STANFORD_SEGMENTER"),
+                env_vars=('SLF4J', 'STANFORD_SEGMENTER'),
                 searchpath=(),
                 url=_stanford_url,
                 verbose=verbose,
@@ -119,8 +124,8 @@ class StanfordSegmenter(TokenizerI):
         self._encoding = encoding
         self.java_options = java_options
         options = {} if options is None else options
-        self._options_cmd = ",".join(
-            "{0}={1}".format(key, json.dumps(val)) for key, val in options.items()
+        self._options_cmd = ','.join(
+            '{0}={1}'.format(key, json.dumps(val)) for key, val in options.items()
         )
 
     def default_config(self, lang):
@@ -130,33 +135,33 @@ class StanfordSegmenter(TokenizerI):
         """
 
         search_path = ()
-        if os.environ.get("STANFORD_SEGMENTER"):
-            search_path = {os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data")}
+        if os.environ.get('STANFORD_SEGMENTER'):
+            search_path = {os.path.join(os.environ.get('STANFORD_SEGMENTER'), 'data')}
 
         # init for Chinese-specific files
         self._dict = None
         self._sihan_corpora_dict = None
-        self._sihan_post_processing = "false"
+        self._sihan_post_processing = 'false'
 
-        if lang == "ar":
+        if lang == 'ar':
             self._java_class = (
-                "edu.stanford.nlp.international.arabic.process.ArabicSegmenter"
+                'edu.stanford.nlp.international.arabic.process.ArabicSegmenter'
             )
-            model = "arabic-segmenter-atb+bn+arztrain.ser.gz"
+            model = 'arabic-segmenter-atb+bn+arztrain.ser.gz'
 
-        elif lang == "zh":
-            self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier"
-            model = "pku.gz"
-            self._sihan_post_processing = "true"
+        elif lang == 'zh':
+            self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
+            model = 'pku.gz'
+            self._sihan_post_processing = 'true'
 
-            path_to_dict = "dict-chris6.ser.gz"
+            path_to_dict = 'dict-chris6.ser.gz'
             try:
                 self._dict = find_file(
                     path_to_dict,
                     searchpath=search_path,
                     url=_stanford_url,
                     verbose=False,
-                    env_vars=("STANFORD_MODELS",),
+                    env_vars=('STANFORD_MODELS',),
                 )
             except LookupError:
                 raise LookupError(
@@ -165,13 +170,13 @@ class StanfordSegmenter(TokenizerI):
                     % path_to_dict
                 )
 
-            sihan_dir = "./data/"
+            sihan_dir = './data/'
             try:
                 path_to_sihan_dir = find_dir(
                     sihan_dir,
                     url=_stanford_url,
                     verbose=False,
-                    env_vars=("STANFORD_SEGMENTER",),
+                    env_vars=('STANFORD_SEGMENTER',),
                 )
                 self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
             except LookupError:
@@ -188,7 +193,7 @@ class StanfordSegmenter(TokenizerI):
                 searchpath=search_path,
                 url=_stanford_url,
                 verbose=False,
-                env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"),
+                env_vars=('STANFORD_MODELS', 'STANFORD_SEGMENTER'),
             )
         except LookupError:
             raise LookupError(
@@ -204,21 +209,21 @@ class StanfordSegmenter(TokenizerI):
         """
         cmd = [
             self._java_class,
-            "-loadClassifier",
+            '-loadClassifier',
             self._model,
-            "-keepAllWhitespaces",
+            '-keepAllWhitespaces',
             self._keep_whitespaces,
-            "-textFile",
+            '-textFile',
             input_file_path,
         ]
         if self._sihan_corpora_dict is not None:
             cmd.extend(
                 [
-                    "-serDictionary",
+                    '-serDictionary',
                     self._dict,
-                    "-sighanCorporaDict",
+                    '-sighanCorporaDict',
                     self._sihan_corpora_dict,
-                    "-sighanPostProcessing",
+                    '-sighanPostProcessing',
                     self._sihan_post_processing,
                 ]
             )
@@ -238,30 +243,30 @@ class StanfordSegmenter(TokenizerI):
         _input_fh, self._input_file_path = tempfile.mkstemp(text=True)
 
         # Write the actual sentences to the temporary input file
-        _input_fh = os.fdopen(_input_fh, "wb")
-        _input = "\n".join((" ".join(x) for x in sentences))
-        if isinstance(_input, str) and encoding:
+        _input_fh = os.fdopen(_input_fh, 'wb')
+        _input = '\n'.join((' '.join(x) for x in sentences))
+        if isinstance(_input, text_type) and encoding:
             _input = _input.encode(encoding)
         _input_fh.write(_input)
         _input_fh.close()
 
         cmd = [
             self._java_class,
-            "-loadClassifier",
+            '-loadClassifier',
             self._model,
-            "-keepAllWhitespaces",
+            '-keepAllWhitespaces',
             self._keep_whitespaces,
-            "-textFile",
+            '-textFile',
             self._input_file_path,
         ]
         if self._sihan_corpora_dict is not None:
             cmd.extend(
                 [
-                    "-serDictionary",
+                    '-serDictionary',
                     self._dict,
-                    "-sighanCorporaDict",
+                    '-sighanCorporaDict',
                     self._sihan_corpora_dict,
-                    "-sighanPostProcessing",
+                    '-sighanPostProcessing',
                     self._sihan_post_processing,
                 ]
             )
@@ -275,12 +280,12 @@ class StanfordSegmenter(TokenizerI):
 
     def _execute(self, cmd, verbose=False):
         encoding = self._encoding
-        cmd.extend(["-inputEncoding", encoding])
+        cmd.extend(['-inputEncoding', encoding])
         _options_cmd = self._options_cmd
         if _options_cmd:
-            cmd.extend(["-options", self._options_cmd])
+            cmd.extend(['-options', self._options_cmd])
 
-        default_options = " ".join(_java_options)
+        default_options = ' '.join(_java_options)
 
         # Configure java.
         config_java(options=self.java_options, verbose=verbose)
@@ -301,9 +306,9 @@ def setup_module(module):
 
     try:
         seg = StanfordSegmenter()
-        seg.default_config("ar")
-        seg.default_config("zh")
+        seg.default_config('ar')
+        seg.default_config('zh')
     except LookupError as e:
         raise SkipTest(
-            "Tests for nltk.tokenize.stanford_segmenter skipped: %s" % str(e)
+            'Tests for nltk.tokenize.stanford_segmenter skipped: %s' % str(e)
         )
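
For context, the constructor hunk above keeps the idiom of temporarily relaxing the DeprecationWarning filter so the deprecation notice is actually shown once, then muting it again. A minimal sketch of that idiom, independent of this patch:

    import warnings

    def warn_deprecated(message):
        # Force DeprecationWarning to be displayed (it is filtered by default),
        # emit it pointing at the caller, then silence further occurrences.
        warnings.simplefilter('always', DeprecationWarning)
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        warnings.simplefilter('ignore', DeprecationWarning)
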
index dbcc980..83da7bf 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: TextTiling
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: George Boutsioukis
 #
 # URL: <http://nltk.org/>
@@ -77,9 +77,9 @@ class TextTilingTokenizer(TokenizerI):
         if stopwords is None:
             from nltk.corpus import stopwords
 
-            stopwords = stopwords.words("english")
+            stopwords = stopwords.words('english')
         self.__dict__.update(locals())
-        del self.__dict__["self"]
+        del self.__dict__['self']
 
     def tokenize(self, text):
         """Return a tokenized copy of *text*, where each "token" represents
@@ -92,8 +92,8 @@ class TextTilingTokenizer(TokenizerI):
         # Tokenization step starts here
 
         # Remove punctuation
-        nopunct_text = "".join(
-            c for c in lowercase_text if re.match("[a-z\-' \n\t]", c)
+        nopunct_text = ''.join(
+            c for c in lowercase_text if re.match("[a-z\-\' \n\t]", c)
         )
         nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text)
 
@@ -392,7 +392,7 @@ class TokenTableField(object):
         last_tok_seq=None,
     ):
         self.__dict__.update(locals())
-        del self.__dict__["self"]
+        del self.__dict__['self']
 
 
 class TokenSequence(object):
@@ -401,11 +401,11 @@ class TokenSequence(object):
     def __init__(self, index, wrdindex_list, original_length=None):
         original_length = original_length or len(wrdindex_list)
         self.__dict__.update(locals())
-        del self.__dict__["self"]
+        del self.__dict__['self']
 
 
 # Pasted from the SciPy cookbook: http://www.scipy.org/Cookbook/SignalSmooth
-def smooth(x, window_len=11, window="flat"):
+def smooth(x, window_len=11, window='flat'):
     """smooth the data using a window with requested size.
 
     This method is based on the convolution of a scaled window with the signal.
@@ -441,7 +441,7 @@ def smooth(x, window_len=11, window="flat"):
     if window_len < 3:
         return x
 
-    if window not in ["flat", "hanning", "hamming", "bartlett", "blackman"]:
+    if window not in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
         raise ValueError(
             "Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'"
         )
@@ -449,12 +449,12 @@ def smooth(x, window_len=11, window="flat"):
     s = numpy.r_[2 * x[0] - x[window_len:1:-1], x, 2 * x[-1] - x[-1:-window_len:-1]]
 
     # print(len(s))
-    if window == "flat":  # moving average
-        w = numpy.ones(window_len, "d")
+    if window == 'flat':  # moving average
+        w = numpy.ones(window_len, 'd')
     else:
-        w = eval("numpy." + window + "(window_len)")
+        w = eval('numpy.' + window + '(window_len)')
 
-    y = numpy.convolve(w / w.sum(), s, mode="same")
+    y = numpy.convolve(w / w.sum(), s, mode='same')
 
     return y[window_len - 1 : -window_len + 1]
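 
 The smooth() function above is, for the default 'flat' window, just a moving average: the signal is extended by mirrored copies at both ends and convolved with a normalized window. A stripped-down sketch of that flat case, assuming a numpy array input:

    import numpy

    def moving_average(x, window_len=11):
        # Mirror the signal at both ends so the window has data near the borders,
        # then convolve with a normalized flat (all-ones) window.
        s = numpy.r_[2 * x[0] - x[window_len:1:-1], x, 2 * x[-1] - x[-1:-window_len:-1]]
        w = numpy.ones(window_len, 'd')
        y = numpy.convolve(w / w.sum(), s, mode='same')
        return y[window_len - 1:-window_len + 1]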
 
index 0c595b2..9779725 100644 (file)
@@ -22,6 +22,7 @@ Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
 """
 
 import re
+from six import text_type
 
 from nltk.tokenize.api import TokenizerI
 
@@ -33,10 +34,10 @@ class ToktokTokenizer(TokenizerI):
 
     >>> toktok = ToktokTokenizer()
     >>> text = u'Is 9.5 or 525,600 my favorite number?'
-    >>> print(toktok.tokenize(text, return_str=True))
+    >>> print (toktok.tokenize(text, return_str=True))
     Is 9.5 or 525,600 my favorite number ?
     >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
-    >>> print(toktok.tokenize(text, return_str=True))
+    >>> print (toktok.tokenize(text, return_str=True))
     The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
     >>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
     >>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
@@ -51,20 +52,20 @@ class ToktokTokenizer(TokenizerI):
     # Pad some funky punctuation.
     FUNKY_PUNCT_1 = re.compile(u'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 "
     # Pad more funky punctuation.
-    FUNKY_PUNCT_2 = re.compile(u"([({\[“‘„‚«‹「『])"), r" \1 "
+    FUNKY_PUNCT_2 = re.compile(u'([({\[“‘„‚«‹「『])'), r" \1 "
     # Pad En dash and em dash
-    EN_EM_DASHES = re.compile(u"([–—])"), r" \1 "
+    EN_EM_DASHES = re.compile(u'([–—])'), r" \1 "
 
     # Replace problematic character with numeric character reference.
-    AMPERCENT = re.compile("& "), "&amp; "
-    TAB = re.compile("\t"), " &#9; "
-    PIPE = re.compile("\|"), " &#124; "
+    AMPERCENT = re.compile('& '), '&amp; '
+    TAB = re.compile('\t'), ' &#9; '
+    PIPE = re.compile('\|'), ' &#124; '
 
     # Pad numbers with commas to keep them from further tokenization.
-    COMMA_IN_NUM = re.compile(r"(?<!,)([,،])(?![,\d])"), r" \1 "
+    COMMA_IN_NUM = re.compile(r'(?<!,)([,،])(?![,\d])'), r' \1 '
 
     # Just pad problematic (often neurotic) hyphen/single quote, etc.
-    PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r" \1 "
+    PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r' \1 '
     # Group ` ` stupid quotes ' ' into a single token.
     STUPID_QUOTES_1 = re.compile(r" ` ` "), r" `` "
     STUPID_QUOTES_2 = re.compile(r" ' ' "), r" '' "
@@ -79,69 +80,69 @@ class ToktokTokenizer(TokenizerI):
     FINAL_PERIOD_2 = re.compile(r"""(?<!\.)\.\s*(["'’»›”]) *$"""), r" . \1"
 
     # Treat continuous commas as fake German,Czech, etc.: „
-    MULTI_COMMAS = re.compile(r"(,{2,})"), r" \1 "
+    MULTI_COMMAS = re.compile(r'(,{2,})'), r' \1 '
     # Treat continuous dashes as fake en-dash, etc.
-    MULTI_DASHES = re.compile(r"(-{2,})"), r" \1 "
+    MULTI_DASHES = re.compile(r'(-{2,})'), r' \1 '
     # Treat multiple periods as a thing (eg. ellipsis)
-    MULTI_DOTS = re.compile(r"(\.{2,})"), r" \1 "
+    MULTI_DOTS = re.compile(r'(\.{2,})'), r' \1 '
 
     # This is the \p{Open_Punctuation} from Perl's perluniprops
     # see http://perldoc.perl.org/perluniprops.html
-    OPEN_PUNCT = str(
-        u"([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d"
-        u"\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772"
-        u"\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983"
-        u"\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993"
-        u"\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26"
-        u"\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016"
-        u"\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39"
-        u"\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b"
-        u"\ufe5d\uff08\uff3b\uff5b\uff5f\uff62"
+    OPEN_PUNCT = text_type(
+        u'([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d'
+        u'\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772'
+        u'\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983'
+        u'\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993'
+        u'\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26'
+        u'\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016'
+        u'\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39'
+        u'\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b'
+        u'\ufe5d\uff08\uff3b\uff5b\uff5f\uff62'
     )
     # This is the \p{Close_Punctuation} from Perl's perluniprops
-    CLOSE_PUNCT = str(
-        u")]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a"
-        u"\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6"
-        u"\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988"
-        u"\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998"
-        u"\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009"
-        u"\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b"
-        u"\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c"
-        u"\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e"
-        u"\uff09\uff3d\uff5d\uff60\uff63"
+    CLOSE_PUNCT = text_type(
+        u')]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a'
+        u'\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6'
+        u'\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988'
+        u'\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998'
+        u'\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009'
+        u'\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b'
+        u'\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c'
+        u'\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e'
+        u'\uff09\uff3d\uff5d\uff60\uff63'
     )
     # This is the \p{Currency_Symbol} from Perl's perluniprops
-    CURRENCY_SYM = str(
-        u"$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb"
-        u"\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3"
-        u"\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab"
-        u"\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3"
-        u"\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838"
-        u"\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6"
+    CURRENCY_SYM = text_type(
+        u'$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb'
+        u'\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3'
+        u'\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab'
+        u'\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3'
+        u'\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838'
+        u'\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6'
     )
 
     # Pad spaces after opening punctuations.
-    OPEN_PUNCT_RE = re.compile(u"([{}])".format(OPEN_PUNCT)), r"\1 "
+    OPEN_PUNCT_RE = re.compile(u'([{}])'.format(OPEN_PUNCT)), r'\1 '
     # Pad spaces before closing punctuations.
-    CLOSE_PUNCT_RE = re.compile(u"([{}])".format(CLOSE_PUNCT)), r"\1 "
+    CLOSE_PUNCT_RE = re.compile(u'([{}])'.format(CLOSE_PUNCT)), r'\1 '
     # Pad spaces after currency symbols.
-    CURRENCY_SYM_RE = re.compile(u"([{}])".format(CURRENCY_SYM)), r"\1 "
+    CURRENCY_SYM_RE = re.compile(u'([{}])'.format(CURRENCY_SYM)), r'\1 '
 
     # Use for tokenizing URL-unfriendly characters: [:/?#]
-    URL_FOE_1 = re.compile(r":(?!//)"), r" : "  # in perl s{:(?!//)}{ : }g;
-    URL_FOE_2 = re.compile(r"\?(?!\S)"), r" ? "  # in perl s{\?(?!\S)}{ ? }g;
+    URL_FOE_1 = re.compile(r':(?!//)'), r' : '  # in perl s{:(?!//)}{ : }g;
+    URL_FOE_2 = re.compile(r'\?(?!\S)'), r' ? '  # in perl s{\?(?!\S)}{ ? }g;
     # in perl: m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
-    URL_FOE_3 = re.compile(r"(:\/\/)[\S+\.\S+\/\S+][\/]"), " / "
-    URL_FOE_4 = re.compile(r" /"), r" / "  # s{ /}{ / }g;
+    URL_FOE_3 = re.compile(r'(:\/\/)[\S+\.\S+\/\S+][\/]'), ' / '
+    URL_FOE_4 = re.compile(r' /'), r' / '  # s{ /}{ / }g;
 
     # Left/Right strip, i.e. remove heading/trailing spaces.
     # These strip regexes should NOT be used,
     # instead use str.lstrip(), str.rstrip() or str.strip()
     # (They are kept for reference purposes to the original toktok.pl code)
-    LSTRIP = re.compile(r"^ +"), ""
-    RSTRIP = re.compile(r"\s+$"), "\n"
+    LSTRIP = re.compile(r'^ +'), ''
+    RSTRIP = re.compile(r'\s+$'), '\n'
     # Merge multiple spaces.
-    ONE_SPACE = re.compile(r" {2,}"), " "
+    ONE_SPACE = re.compile(r' {2,}'), ' '
 
     TOKTOK_REGEXES = [
         NON_BREAKING,
@@ -171,10 +172,10 @@ class ToktokTokenizer(TokenizerI):
     ]
 
     def tokenize(self, text, return_str=False):
-        text = str(text)  # Converts input string into unicode.
+        text = text_type(text)  # Converts input string into unicode.
         for regexp, subsitution in self.TOKTOK_REGEXES:
             text = regexp.sub(subsitution, text)
         # Finally, strips heading and trailing spaces
         # and converts output string into unicode.
-        text = str(text.strip())
+        text = text_type(text.strip())
         return text if return_str else text.split()
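
tokenize() above is simply an ordered pipeline of (compiled regex, replacement) pairs. The sketch below illustrates the same mechanism with two made-up rules rather than the full TOKTOK_REGEXES list:

    import re

    RULES = [
        (re.compile(r'([?!])'), r' \1 '),  # pad sentence-final punctuation
        (re.compile(r' {2,}'), ' '),       # merge runs of spaces
    ]

    def tiny_tokenize(text, return_str=False):
        for regexp, substitution in RULES:
            text = regexp.sub(substitution, text)
        text = text.strip()
        return text if return_str else text.split()

    # tiny_tokenize('Is 9.5 my favorite number?') == ['Is', '9.5', 'my', 'favorite', 'number', '?']
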
index 593ff05..686d404 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: Tokenizers
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
 #
@@ -19,7 +19,25 @@ and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
 import re
 from nltk.tokenize.api import TokenizerI
 from nltk.tokenize.util import align_tokens
-from nltk.tokenize.destructive import MacIntyreContractions
+
+
+class MacIntyreContractions:
+    """
+    List of contractions adapted from Robert MacIntyre's tokenizer.
+    """
+
+    CONTRACTIONS2 = [
+        r"(?i)\b(can)(?#X)(not)\b",
+        r"(?i)\b(d)(?#X)('ye)\b",
+        r"(?i)\b(gim)(?#X)(me)\b",
+        r"(?i)\b(gon)(?#X)(na)\b",
+        r"(?i)\b(got)(?#X)(ta)\b",
+        r"(?i)\b(lem)(?#X)(me)\b",
+        r"(?i)\b(mor)(?#X)('n)\b",
+        r"(?i)\b(wan)(?#X)(na)\s",
+    ]
+    CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
+    CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
 
 
 class TreebankWordTokenizer(TokenizerI):
@@ -49,44 +67,44 @@ class TreebankWordTokenizer(TokenizerI):
 
     # starting quotes
     STARTING_QUOTES = [
-        (re.compile(r"^\""), r"``"),
-        (re.compile(r"(``)"), r" \1 "),
-        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
+        (re.compile(r'^\"'), r'``'),
+        (re.compile(r'(``)'), r' \1 '),
+        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r'\1 `` '),
     ]
 
     # punctuation
     PUNCTUATION = [
-        (re.compile(r"([:,])([^\d])"), r" \1 \2"),
-        (re.compile(r"([:,])$"), r" \1 "),
-        (re.compile(r"\.\.\."), r" ... "),
-        (re.compile(r"[;@#$%&]"), r" \g<0> "),
+        (re.compile(r'([:,])([^\d])'), r' \1 \2'),
+        (re.compile(r'([:,])$'), r' \1 '),
+        (re.compile(r'\.\.\.'), r' ... '),
+        (re.compile(r'[;@#$%&]'), r' \g<0> '),
         (
             re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
-            r"\1 \2\3 ",
+            r'\1 \2\3 ',
         ),  # Handles the final period.
-        (re.compile(r"[?!]"), r" \g<0> "),
+        (re.compile(r'[?!]'), r' \g<0> '),
         (re.compile(r"([^'])' "), r"\1 ' "),
     ]
 
     # Pads parentheses
-    PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
+    PARENS_BRACKETS = (re.compile(r'[\]\[\(\)\{\}\<\>]'), r' \g<0> ')
 
     # Optionally: Convert parentheses, brackets and converts them to PTB symbols.
     CONVERT_PARENTHESES = [
-        (re.compile(r"\("), "-LRB-"),
-        (re.compile(r"\)"), "-RRB-"),
-        (re.compile(r"\["), "-LSB-"),
-        (re.compile(r"\]"), "-RSB-"),
-        (re.compile(r"\{"), "-LCB-"),
-        (re.compile(r"\}"), "-RCB-"),
+        (re.compile(r'\('), '-LRB-'),
+        (re.compile(r'\)'), '-RRB-'),
+        (re.compile(r'\['), '-LSB-'),
+        (re.compile(r'\]'), '-RSB-'),
+        (re.compile(r'\{'), '-LCB-'),
+        (re.compile(r'\}'), '-RCB-'),
     ]
 
-    DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
+    DOUBLE_DASHES = (re.compile(r'--'), r' -- ')
 
     # ending quotes
     ENDING_QUOTES = [
         (re.compile(r'"'), " '' "),
-        (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
+        (re.compile(r'(\S)(\'\')'), r'\1 \2 '),
         (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
         (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
     ]
@@ -122,9 +140,9 @@ class TreebankWordTokenizer(TokenizerI):
             text = regexp.sub(substitution, text)
 
         for regexp in self.CONTRACTIONS2:
-            text = regexp.sub(r" \1 \2 ", text)
+            text = regexp.sub(r' \1 \2 ', text)
         for regexp in self.CONTRACTIONS3:
-            text = regexp.sub(r" \1 \2 ", text)
+            text = regexp.sub(r' \1 \2 ', text)
 
         # We are not using CONTRACTIONS4 since
         # they are also commented out in the SED scripts
@@ -252,11 +270,11 @@ class TreebankWordDetokenizer(TokenizerI):
 
     _contractions = MacIntyreContractions()
     CONTRACTIONS2 = [
-        re.compile(pattern.replace("(?#X)", "\s"))
+        re.compile(pattern.replace('(?#X)', '\s'))
         for pattern in _contractions.CONTRACTIONS2
     ]
     CONTRACTIONS3 = [
-        re.compile(pattern.replace("(?#X)", "\s"))
+        re.compile(pattern.replace('(?#X)', '\s'))
         for pattern in _contractions.CONTRACTIONS3
     ]
 
@@ -264,75 +282,74 @@ class TreebankWordDetokenizer(TokenizerI):
     ENDING_QUOTES = [
         (re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "),
         (re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "),
-        (re.compile(r"(\S)(\'\')"), r"\1\2 "),
+        (re.compile(r'(\S)(\'\')'), r'\1\2 '),
         (re.compile(r" '' "), '"'),
     ]
 
     # Handles double dashes
-    DOUBLE_DASHES = (re.compile(r" -- "), r"--")
+    DOUBLE_DASHES = (re.compile(r' -- '), r'--')
 
     # Optionally: Convert parentheses, brackets and converts them from PTB symbols.
     CONVERT_PARENTHESES = [
-        (re.compile("-LRB-"), "("),
-        (re.compile("-RRB-"), ")"),
-        (re.compile("-LSB-"), "["),
-        (re.compile("-RSB-"), "]"),
-        (re.compile("-LCB-"), "{"),
-        (re.compile("-RCB-"), "}"),
+        (re.compile('-LRB-'), '('),
+        (re.compile('-RRB-'), ')'),
+        (re.compile('-LSB-'), '['),
+        (re.compile('-RSB-'), ']'),
+        (re.compile('-LCB-'), '{'),
+        (re.compile('-RCB-'), '}'),
     ]
 
     # Undo padding on parentheses.
     PARENS_BRACKETS = [
-        (re.compile(r"\s([\[\(\{\<])\s"), r" \g<1>"),
-        (re.compile(r"\s([\]\)\}\>])\s"), r"\g<1> "),
-        (re.compile(r"([\]\)\}\>])\s([:;,.])"), r"\1\2"),
+        (re.compile(r'\s([\[\(\{\<])\s'), r' \g<1>'),
+        (re.compile(r'\s([\]\)\}\>])\s'), r'\g<1> '),
+        (re.compile(r'([\]\)\}\>])\s([:;,.])'), r'\1\2'),
     ]
 
     # punctuation
     PUNCTUATION = [
         (re.compile(r"([^'])\s'\s"), r"\1' "),
-        (re.compile(r"\s([?!])"), r"\g<1>"),  # Strip left pad for [?!]
+        (re.compile(r'\s([?!])'), r'\g<1>'),  # Strip left pad for [?!]
         # (re.compile(r'\s([?!])\s'), r'\g<1>'),
-        (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r"\1\2\3"),
+        (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r'\1\2\3'),
         # When tokenizing, [;@#$%&] are padded with whitespace regardless of
         # whether there are spaces before or after them.
         # But during detokenization, we need to distinguish between left/right
         # pad, so we split this up.
-        (re.compile(r"\s([#$])\s"), r" \g<1>"),  # Left pad.
-        (re.compile(r"\s([;%])\s"), r"\g<1> "),  # Right pad.
-        (re.compile(r"\s([&*])\s"), r" \g<1> "),  # Unknown pad.
-        (re.compile(r"\s\.\.\.\s"), r"..."),
-        (re.compile(r"\s([:,])\s$"), r"\1"),
+        (re.compile(r'\s([#$])\s'), r' \g<1>'),  # Left pad.
+        (re.compile(r'\s([;%])\s'), r'\g<1> '),  # Right pad.
+        (re.compile(r'\s([&])\s'), r' \g<1> '),  # Unknown pad.
+        (re.compile(r'\s\.\.\.\s'), r'...'),
+        (re.compile(r'\s([:,])\s$'), r'\1'),
         (
-            re.compile(r"\s([:,])\s([^\d])"),
-            r"\1 \2",
+            re.compile(r'\s([:,])\s([^\d])'),
+            r'\1 \2',
         )  # Keep right pad after comma/colon before non-digits.
         # (re.compile(r'\s([:,])\s([^\d])'), r'\1\2')
     ]
 
     # starting quotes
     STARTING_QUOTES = [
-        (re.compile(r"([ (\[{<])\s``"), r'\1"'),
-        (re.compile(r"\s(``)\s"), r"\1"),
-        (re.compile(r"^``"), r"\""),
+        (re.compile(r'([ (\[{<])\s``'), r'\1"'),
+        (re.compile(r'\s(``)\s'), r'\1'),
+        (re.compile(r'^``'), r'\"'),
     ]
 
     def tokenize(self, tokens, convert_parentheses=False):
         """
-        Treebank detokenizer, created by undoing the regexes from
-        the TreebankWordTokenizer.tokenize.
+        Python port of the Moses detokenizer.
 
         :param tokens: A list of strings, i.e. tokenized text.
         :type tokens: list(str)
         :return: str
         """
-        text = " ".join(tokens)
+        text = ' '.join(tokens)
         # Reverse the contractions regexes.
         # Note: CONTRACTIONS4 are not used in tokenization.
         for regexp in self.CONTRACTIONS3:
-            text = regexp.sub(r"\1\2", text)
+            text = regexp.sub(r'\1\2', text)
         for regexp in self.CONTRACTIONS2:
-            text = regexp.sub(r"\1\2", text)
+            text = regexp.sub(r'\1\2', text)
 
         # Reverse the regexes applied for ending quotes.
         for regexp, substitution in self.ENDING_QUOTES:
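
A detail the detokenizer hunks rely on: the contraction patterns embed the no-op regex comment (?#X) as a placeholder, which is swapped for \s before compiling so the pattern matches the space inserted by the tokenizer. A small illustrative sketch using two of the CONTRACTIONS2 patterns restored above:

    import re

    CONTRACTIONS2 = [r"(?i)\b(can)(?#X)(not)\b", r"(?i)\b(gon)(?#X)(na)\b"]

    # Replace the placeholder with \s and rejoin the two captured halves.
    DETOK = [re.compile(p.replace('(?#X)', r'\s')) for p in CONTRACTIONS2]

    def rejoin_contractions(text):
        for regexp in DETOK:
            text = regexp.sub(r'\1\2', text)
        return text

    # rejoin_contractions('I can not say') == 'I cannot say'
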
index be7c12b..a91f129 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Tokenizer Utilities
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.sourceforge.net>
 # For license information, see LICENSE.TXT
index f9b5caa..74f4dbc 100644 (file)
@@ -1,7 +1,7 @@
 # coding: utf-8
 # Natural Language Toolkit: Toolbox Reader
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Greg Aumann <greg_aumann@sil.org>
 # URL: <http://nltk.org>
 # For license information, see LICENSE.TXT
 Module for reading, writing and manipulating
 Toolbox databases and settings files.
 """
+from __future__ import print_function
 
 import re, codecs
 from xml.etree.ElementTree import ElementTree, TreeBuilder, Element, SubElement
-from io import StringIO
 
+from six import u
+
+from nltk.compat import StringIO, PY3
 from nltk.data import PathPointer, find
 
 
@@ -40,7 +43,7 @@ class StandardFormat(object):
             #      (PathPointer.open doesn't take a mode option)
             self._file = sfm_file.open(self._encoding)
         else:
-            self._file = codecs.open(sfm_file, "rU", self._encoding)
+            self._file = codecs.open(sfm_file, 'rU', self._encoding)
 
     def open_string(self, s):
         """
@@ -59,11 +62,11 @@ class StandardFormat(object):
 
         :rtype: iter(tuple(str, str))
         """
-        join_string = "\n"
-        line_regexp = r"^%s(?:\\(\S+)\s*)?(.*)$"
+        join_string = '\n'
+        line_regexp = r'^%s(?:\\(\S+)\s*)?(.*)$'
         # discard a BOM in the first line
-        first_line_pat = re.compile(line_regexp % "(?:\xef\xbb\xbf)?")
-        line_pat = re.compile(line_regexp % "")
+        first_line_pat = re.compile(line_regexp % '(?:\xef\xbb\xbf)?')
+        line_pat = re.compile(line_regexp % '')
         # need to get first line outside the loop for correct handling
         # of the first marker if it spans multiple lines
         file_iter = iter(self._file)
@@ -95,7 +98,7 @@ class StandardFormat(object):
         strip=True,
         unwrap=True,
         encoding=None,
-        errors="strict",
+        errors='strict',
         unicode_fields=None,
     ):
         """
@@ -122,11 +125,17 @@ class StandardFormat(object):
         :rtype: iter(tuple(str, str))
         """
         if encoding is None and unicode_fields is not None:
-            raise ValueError("unicode_fields is set but not encoding.")
-        unwrap_pat = re.compile(r"\n+")
+            raise ValueError('unicode_fields is set but not encoding.')
+        unwrap_pat = re.compile(r'\n+')
         for mkr, val in self.raw_fields():
+            if encoding and not PY3:  # kludge - already decoded in PY3?
+                if unicode_fields is not None and mkr in unicode_fields:
+                    val = val.decode('utf8', errors)
+                else:
+                    val = val.decode(encoding, errors)
+                mkr = mkr.decode(encoding, errors)
             if unwrap:
-                val = unwrap_pat.sub(" ", val)
+                val = unwrap_pat.sub(' ', val)
             if strip:
                 val = val.rstrip()
             yield (mkr, val)
@@ -200,27 +209,27 @@ class ToolboxData(StandardFormat):
         :return: contents of toolbox data divided into header and records
         """
         builder = TreeBuilder()
-        builder.start("toolbox_data", {})
-        builder.start("header", {})
+        builder.start('toolbox_data', {})
+        builder.start('header', {})
         in_records = False
         for mkr, value in self.fields(**kwargs):
-            if key is None and not in_records and mkr[0] != "_":
+            if key is None and not in_records and mkr[0] != '_':
                 key = mkr
             if mkr == key:
                 if in_records:
-                    builder.end("record")
+                    builder.end('record')
                 else:
-                    builder.end("header")
+                    builder.end('header')
                     in_records = True
-                builder.start("record", {})
+                builder.start('record', {})
             builder.start(mkr, {})
             builder.data(value)
             builder.end(mkr)
         if in_records:
-            builder.end("record")
+            builder.end('record')
         else:
-            builder.end("header")
-        builder.end("toolbox_data")
+            builder.end('header')
+        builder.end('toolbox_data')
         return builder.close()
 
     def _tree2etree(self, parent):
@@ -236,7 +245,7 @@ class ToolboxData(StandardFormat):
                 e.text = text
         return root
 
-    def _chunk_parse(self, grammar=None, root_label="record", trace=0, **kwargs):
+    def _chunk_parse(self, grammar=None, root_label='record', trace=0, **kwargs):
         """
         Returns an element tree structure corresponding to a toolbox data file
         parsed according to the chunk grammar.
@@ -261,10 +270,10 @@ class ToolboxData(StandardFormat):
 
         cp = chunk.RegexpParser(grammar, root_label=root_label, trace=trace)
         db = self.parse(**kwargs)
-        tb_etree = Element("toolbox_data")
-        header = db.find("header")
+        tb_etree = Element('toolbox_data')
+        header = db.find('header')
         tb_etree.append(header)
-        for record in db.findall("record"):
+        for record in db.findall('record'):
             parsed = cp.parse([(elem.text, elem.tag) for elem in record])
             tb_etree.append(self._tree2etree(parsed))
         return tb_etree
@@ -273,7 +282,7 @@ class ToolboxData(StandardFormat):
 _is_value = re.compile(r"\S")
 
 
-def to_sfm_string(tree, encoding=None, errors="strict", unicode_fields=None):
+def to_sfm_string(tree, encoding=None, errors='strict', unicode_fields=None):
     """
     Return a string with a standard format representation of the toolbox
     data in tree (tree can be a toolbox database or a single record).
@@ -289,12 +298,12 @@ def to_sfm_string(tree, encoding=None, errors="strict", unicode_fields=None):
     :type unicode_fields: dict(str) or set(str)
     :rtype: str
     """
-    if tree.tag == "record":
-        root = Element("toolbox_data")
+    if tree.tag == 'record':
+        root = Element('toolbox_data')
         root.append(tree)
         tree = root
 
-    if tree.tag != "toolbox_data":
+    if tree.tag != 'toolbox_data':
         raise ValueError("not a toolbox_data element structure")
     if encoding is None and unicode_fields is not None:
         raise ValueError(
@@ -302,29 +311,29 @@ def to_sfm_string(tree, encoding=None, errors="strict", unicode_fields=None):
         )
     l = []
     for rec in tree:
-        l.append("\n")
+        l.append('\n')
         for field in rec:
             mkr = field.tag
             value = field.text
             if encoding is not None:
                 if unicode_fields is not None and mkr in unicode_fields:
-                    cur_encoding = "utf8"
+                    cur_encoding = 'utf8'
                 else:
                     cur_encoding = encoding
                 if re.search(_is_value, value):
                     l.append(
-                        ("\\%s %s\n" % (mkr, value)).encode(cur_encoding, errors)
+                        (u("\\%s %s\n") % (mkr, value)).encode(cur_encoding, errors)
                     )
                 else:
                     l.append(
-                        ("\\%s%s\n" % (mkr, value)).encode(cur_encoding, errors)
+                        (u("\\%s%s\n") % (mkr, value)).encode(cur_encoding, errors)
                     )
             else:
                 if re.search(_is_value, value):
                     l.append("\\%s %s\n" % (mkr, value))
                 else:
                     l.append("\\%s%s\n" % (mkr, value))
-    return "".join(l[1:])
+    return ''.join(l[1:])
 
 
 class ToolboxSettings(StandardFormat):
@@ -333,7 +342,7 @@ class ToolboxSettings(StandardFormat):
     def __init__(self):
         super(ToolboxSettings, self).__init__()
 
-    def parse(self, encoding=None, errors="strict", **kwargs):
+    def parse(self, encoding=None, errors='strict', **kwargs):
         """
         Return the contents of toolbox settings file with a nested structure.
 
@@ -358,7 +367,7 @@ class ToolboxSettings(StandardFormat):
             if block == "+":
                 builder.start(mkr, {})
                 builder.data(value)
-            elif block == "-":
+            elif block == '-':
                 builder.end(mkr)
             else:
                 builder.start(mkr, {})
@@ -367,7 +376,7 @@ class ToolboxSettings(StandardFormat):
         return builder.close()
 
 
-def to_settings_string(tree, encoding=None, errors="strict", unicode_fields=None):
+def to_settings_string(tree, encoding=None, errors='strict', unicode_fields=None):
     # write XML to file
     l = list()
     _to_settings_string(
@@ -377,7 +386,7 @@ def to_settings_string(tree, encoding=None, errors="strict", unicode_fields=None
         errors=errors,
         unicode_fields=unicode_fields,
     )
-    return "".join(l)
+    return ''.join(l)
 
 
 def _to_settings_string(node, l, **kwargs):
@@ -386,17 +395,17 @@ def _to_settings_string(node, l, **kwargs):
     text = node.text
     if len(node) == 0:
         if text:
-            l.append("\\%s %s\n" % (tag, text))
+            l.append('\\%s %s\n' % (tag, text))
         else:
-            l.append("\\%s\n" % tag)
+            l.append('\\%s\n' % tag)
     else:
         if text:
-            l.append("\\+%s %s\n" % (tag, text))
+            l.append('\\+%s %s\n' % (tag, text))
         else:
-            l.append("\\+%s\n" % tag)
+            l.append('\\+%s\n' % tag)
         for n in node:
             _to_settings_string(n, l, **kwargs)
-        l.append("\\-%s\n" % tag)
+        l.append('\\-%s\n' % tag)
     return
 
 
@@ -502,29 +511,29 @@ def demo():
 
     #    zip_path = find('corpora/toolbox.zip')
     #    lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
-    file_path = find("corpora/toolbox/rotokas.dic")
+    file_path = find('corpora/toolbox/rotokas.dic')
     lexicon = ToolboxData(file_path).parse()
-    print("first field in fourth record:")
+    print('first field in fourth record:')
     print(lexicon[3][0].tag)
     print(lexicon[3][0].text)
 
-    print("\nfields in sequential order:")
-    for field in islice(lexicon.find("record"), 10):
+    print('\nfields in sequential order:')
+    for field in islice(lexicon.find('record'), 10):
         print(field.tag, field.text)
 
-    print("\nlx fields:")
-    for field in islice(lexicon.findall("record/lx"), 10):
+    print('\nlx fields:')
+    for field in islice(lexicon.findall('record/lx'), 10):
         print(field.text)
 
     settings = ToolboxSettings()
-    file_path = find("corpora/toolbox/MDF/MDF_AltH.typ")
+    file_path = find('corpora/toolbox/MDF/MDF_AltH.typ')
     settings.open(file_path)
     #    settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
-    tree = settings.parse(unwrap=False, encoding="cp1252")
-    print(tree.find("expset/expMDF/rtfPageSetup/paperSize").text)
+    tree = settings.parse(unwrap=False, encoding='cp1252')
+    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
     settings_tree = ElementTree(tree)
-    print(to_settings_string(settings_tree).encode("utf8"))
+    print(to_settings_string(settings_tree).encode('utf8'))
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
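
For orientation, the raw_fields() hunk earlier in this file splits each SFM line into a backslash marker and its value using the regex restored above. A toy illustration with a made-up two-field record (the marker names are only examples):

    import re

    line_pat = re.compile(r'^(?:\\(\S+)\s*)?(.*)$')

    record = '\\lx kaa\n\\ge tree'
    for line in record.splitlines():
        marker, value = line_pat.match(line).groups()
        print(marker, value)
    # -> lx kaa
    # -> ge tree
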
index 21ddf8a..3a1b2e5 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Machine Translation
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>, Tah Wei Hoon <hoon.tw@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -20,6 +20,5 @@ from nltk.translate.ibm4 import IBMModel4
 from nltk.translate.ibm5 import IBMModel5
 from nltk.translate.bleu_score import sentence_bleu as bleu
 from nltk.translate.ribes_score import sentence_ribes as ribes
-from nltk.translate.meteor_score import meteor_score as meteor
 from nltk.translate.metrics import alignment_error_rate
 from nltk.translate.stack_decoder import StackDecoder
index 6e89142..0c44521 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/__init__.cpython-37.pyc differ
index 51ca6de..66ec1c6 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/api.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/api.cpython-37.pyc differ
index 4ab78a5..d1a3787 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/bleu_score.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/bleu_score.cpython-37.pyc differ
index 431f99e..6325651 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/chrf_score.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/chrf_score.cpython-37.pyc differ
index 0be5c76..57322b4 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/gale_church.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/gale_church.cpython-37.pyc differ
index c41e3e3..b81fcd3 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/gdfa.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/gdfa.cpython-37.pyc differ
index c82a872..359001f 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/gleu_score.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/gleu_score.cpython-37.pyc differ
index 42b670d..9ca23b5 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/ibm1.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/ibm1.cpython-37.pyc differ
index 1c38ba0..b3900c1 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/ibm2.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/ibm2.cpython-37.pyc differ
index b85e973..20e7de5 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/ibm3.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/ibm3.cpython-37.pyc differ
index f36c4dc..063087f 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/ibm4.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/ibm4.cpython-37.pyc differ
index 758fd67..fc544f7 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/ibm5.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/ibm5.cpython-37.pyc differ
index 86f8784..313bb17 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/ibm_model.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/ibm_model.cpython-37.pyc differ
diff --git a/nlp_resource_data/nltk/translate/__pycache__/meteor_score.cpython-37.pyc b/nlp_resource_data/nltk/translate/__pycache__/meteor_score.cpython-37.pyc
deleted file mode 100644 (file)
index 9afe132..0000000
Binary files a/nlp_resource_data/nltk/translate/__pycache__/meteor_score.cpython-37.pyc and /dev/null differ
index 3cc8a31..085b024 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/metrics.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/metrics.cpython-37.pyc differ
index a22b4ea..afaa828 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/nist_score.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/nist_score.cpython-37.pyc differ
index c76d2fc..16a3cc7 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/phrase_based.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/phrase_based.cpython-37.pyc differ
index 85b6ca2..c23dd80 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/ribes_score.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/ribes_score.cpython-37.pyc differ
index ae9377f..fddf259 100644 (file)
Binary files a/nlp_resource_data/nltk/translate/__pycache__/stack_decoder.cpython-37.pyc and b/nlp_resource_data/nltk/translate/__pycache__/stack_decoder.cpython-37.pyc differ
index 9efcbde..b889410 100644 (file)
@@ -1,6 +1,6 @@
 # Natural Language Toolkit: API for alignment and translation objects
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Will Zhang <wilzzha@gmail.com>
 #         Guan Gui <ggui@student.unimelb.edu.au>
 #         Steven Bird <stevenbird1@gmail.com>
@@ -8,10 +8,14 @@
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
+from __future__ import print_function, unicode_literals
 import subprocess
 from collections import namedtuple
 
+from nltk.compat import python_2_unicode_compatible
 
+
+@python_2_unicode_compatible
 class AlignedSent(object):
     """
     Return an aligned sentence object, which encapsulates two sentences
@@ -85,8 +89,8 @@ class AlignedSent(object):
         """
         Dot representation of the aligned sentence
         """
-        s = "graph align {\n"
-        s += "node[shape=plaintext]\n"
+        s = 'graph align {\n'
+        s += 'node[shape=plaintext]\n'
 
         # Declare node
         for w in self._words:
@@ -114,10 +118,10 @@ class AlignedSent(object):
             )
 
         # Put it in the same rank
-        s += "{rank = same; %s}\n" % (" ".join('"%s_source"' % w for w in self._words))
-        s += "{rank = same; %s}\n" % (" ".join('"%s_target"' % w for w in self._mots))
+        s += '{rank = same; %s}\n' % (' '.join('"%s_source"' % w for w in self._words))
+        s += '{rank = same; %s}\n' % (' '.join('"%s_target"' % w for w in self._mots))
 
-        s += "}"
+        s += '}'
 
         return s
 
@@ -125,20 +129,20 @@ class AlignedSent(object):
         """
         Ipython magic : show SVG representation of this ``AlignedSent``.
         """
-        dot_string = self._to_dot().encode("utf8")
-        output_format = "svg"
+        dot_string = self._to_dot().encode('utf8')
+        output_format = 'svg'
         try:
             process = subprocess.Popen(
-                ["dot", "-T%s" % output_format],
+                ['dot', '-T%s' % output_format],
                 stdin=subprocess.PIPE,
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
             )
         except OSError:
-            raise Exception("Cannot find the dot binary from Graphviz package")
+            raise Exception('Cannot find the dot binary from Graphviz package')
         out, err = process.communicate(dot_string)
 
-        return out.decode("utf8")
+        return out.decode('utf8')
 
     def __str__(self):
         """
@@ -159,6 +163,7 @@ class AlignedSent(object):
         return AlignedSent(self._mots, self._words, self._alignment.invert())
 
 
+@python_2_unicode_compatible
 class Alignment(frozenset):
     """
     A storage class for representing alignment between two sequences, s1, s2.
@@ -288,7 +293,7 @@ def _check_alignment(num_words, num_mots, alignment):
         raise IndexError("Alignment is outside boundary of mots")
 
 
-PhraseTableEntry = namedtuple("PhraseTableEntry", ["trg_phrase", "log_prob"])
+PhraseTableEntry = namedtuple('PhraseTableEntry', ['trg_phrase', 'log_prob'])
 
 
 class PhraseTable(object):
index a6a79a1..4617203 100644 (file)
@@ -1,22 +1,29 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: BLEU Score
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
 # Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 """BLEU score implementation."""
+from __future__ import division
 
 import math
 import sys
-from fractions import Fraction
+import fractions
 import warnings
 from collections import Counter
 
 from nltk.util import ngrams
 
+try:
+    fractions.Fraction(0, 1000, _normalize=False)
+    from fractions import Fraction
+except TypeError:
+    from nltk.compat import Fraction
+
 
 def sentence_bleu(
     references,
@@ -467,23 +474,23 @@ class SmoothingFunction:
         ...               'Party', 'commands']
 
         >>> chencherry = SmoothingFunction()
-        >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
+        >>> print (sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
         0.4118...
-        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
         0.4118...
-        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
         0.4118...
-        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
         0.4489...
-        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
         0.4118...
-        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
         0.4118...
-        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
         0.4905...
-        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
         0.4135...
-        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
+        >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
         0.4905...
 
         :param epsilon: the epsilon value use in method 1
@@ -567,7 +574,7 @@ class SmoothingFunction:
                 incvnt += 1
         return p_n
 
-    def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+    def method4(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
         """
         Smoothing method 4:
         Shorter translations may have inflated precision values due to having
@@ -575,23 +582,21 @@ class SmoothingFunction:
         smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
         suggests dividing by 1/ln(len(T)), where T is the length of the translation.
         """
-        hyp_len = hyp_len if hyp_len else len(hypothesis)
         for i, p_i in enumerate(p_n):
             if p_i.numerator == 0 and hyp_len != 0:
                 incvnt = i + 1 * self.k / math.log(
                     hyp_len
                 )  # Note that this K is different from the K from NIST.
-                p_n[i] = incvnt / p_i.denominator
+                p_n[i] = 1 / incvnt
         return p_n
 
-    def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+    def method5(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
         """
         Smoothing method 5:
         The matched counts for similar values of n should be similar. To a
         calculate the n-gram matched count, it averages the n−1, n and n+1 gram
         matched counts.
         """
-        hyp_len = hyp_len if hyp_len else len(hypothesis)
         m = {}
         # Requires a precision value for an additional ngram order.
         p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
@@ -601,7 +606,7 @@ class SmoothingFunction:
             m[i] = p_n[i]
         return p_n
 
-    def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+    def method6(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
         """
         Smoothing method 6:
         Interpolates the maximum likelihood estimate of the precision *p_n* with
@@ -610,7 +615,6 @@ class SmoothingFunction:
         Gao and He (2013) Training MRF-Based Phrase Translation Models using
         Gradient Ascent. In NAACL.
         """
-        hyp_len = hyp_len if hyp_len else len(hypothesis)
         # This smoothing only works when p_1 and p_2 is non-zero.
         # Raise an error with an appropriate message when the input is too short
         # to use this smoothing technique.
@@ -628,12 +632,13 @@ class SmoothingFunction:
                 p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
         return p_n
 
-    def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
+    def method7(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
         """
-        Smoothing method 7:
-        Interpolates methods 5 and 6.
+        Smoothing method 6:
+        Interpolates the maximum likelihood estimate of the precision *p_n* with
+        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
+        between pn and pn−1 will be the same as that between pn−1 and pn−2.
         """
-        hyp_len = hyp_len if hyp_len else len(hypothesis)
         p_n = self.method4(p_n, references, hypothesis, hyp_len)
         p_n = self.method5(p_n, references, hypothesis, hyp_len)
         return p_n
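
The import hunk at the top of this file restores a feature-detection idiom: BLEU keeps its modified n-gram precisions as unreduced Fractions (clipped matches over total n-grams), which requires the private _normalize=False argument; interpreters whose fractions.Fraction lacks it fall back to NLTK's compatibility class. The probe, as restored by the patch, works like this:

    import fractions

    try:
        # Newer Fraction accepts _normalize=False and keeps 0/1000 unreduced.
        fractions.Fraction(0, 1000, _normalize=False)
        from fractions import Fraction
    except TypeError:
        # Older interpreters reject the keyword; use the backported shim.
        from nltk.compat import Fraction
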
index ef5fb90..f77a026 100644 (file)
@@ -1,13 +1,14 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: ChrF score
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors: Maja Popovic
 # Contributors: Liling Tan, Aleš Tamchyna (Memsource)
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 """ ChrF score implementation """
+from __future__ import division
 from collections import Counter, defaultdict
 import re
 
@@ -102,10 +103,10 @@ def sentence_chrf(
 def _preprocess(sent, ignore_whitespace):
     if type(sent) != str:
         # turn list of tokens into a string
-        sent = " ".join(sent)
+        sent = ' '.join(sent)
 
     if ignore_whitespace:
-        sent = re.sub(r"\s+", "", sent)
+        sent = re.sub(r'\s+', '', sent)
     return sent
 
 
index 80aa4c1..582951c 100644 (file)
@@ -2,7 +2,7 @@
 
 # Natural Language Toolkit: Gale-Church Aligner
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Torsten Marek <marek@ifi.uzh.ch>
 # Contributor: Cassidy Laidlaw, Liling Tan
 # URL: <http://nltk.org/>
@@ -17,6 +17,7 @@ http://aclweb.org/anthology/J93-1004.pdf
 
 """
 
+from __future__ import division
 import math
 
 try:
@@ -71,7 +72,7 @@ except ImportError:
         try:
             return math.log(1 - norm_cdf(x))
         except ValueError:
-            return float("-inf")
+            return float('-inf')
 
 
 LOG2 = math.log(2)
@@ -146,7 +147,7 @@ def align_log_prob(i, j, source_sents, target_sents, alignment, params):
             m * params.VARIANCE_CHARACTERS
         )
     except ZeroDivisionError:
-        return float("-inf")
+        return float('-inf')
 
     return -(LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment]))
 
@@ -178,7 +179,7 @@ def align_blocks(source_sents_lens, target_sents_lens, params=LanguageIndependen
 
     for i in range(len(source_sents_lens) + 1):
         for j in range(len(target_sents_lens) + 1):
-            min_dist = float("inf")
+            min_dist = float('inf')
             min_align = None
             for a in alignment_types:
                 prev_i = -1 - a[0]
@@ -192,7 +193,7 @@ def align_blocks(source_sents_lens, target_sents_lens, params=LanguageIndependen
                     min_dist = p
                     min_align = a
 
-            if min_dist == float("inf"):
+            if min_dist == float('inf'):
                 min_dist = 0
 
             backlinks[(i, j)] = min_align
@@ -264,3 +265,11 @@ def parse_token_stream(stream, soft_delimiter, hard_delimiter):
         for block_it in split_at(stream, hard_delimiter)
     ]
 
+
+#    Code for test files in nltk_contrib/align/data/*.tok
+#    import sys
+#    from contextlib import nested
+#    with nested(open(sys.argv[1], "r"), open(sys.argv[2], "r")) as (s, t):
+#        source = parse_token_stream((l.strip() for l in s), ".EOS", ".EOP")
+#        target = parse_token_stream((l.strip() for l in t), ".EOS", ".EOP")
+#        print align_texts(source, target)
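
The hunks above also touch the pure-Python fallback used when scipy is not installed: the log survival function of the standard normal is computed from the CDF directly, with underflow mapped to -inf. A compact restatement of that fallback (using math.erf for the CDF, which differs from the module's own approximation):

    import math

    def norm_cdf(x):
        # Standard normal CDF via the error function.
        return (1 + math.erf(x / math.sqrt(2))) / 2

    def norm_logsf(x):
        try:
            return math.log(1 - norm_cdf(x))
        except ValueError:
            # log(0): the tail probability underflowed to zero.
            return float('-inf')
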
index bc0e91b..bdea805 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: GDFA word alignment symmetrization
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors: Liling Tan
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -67,8 +67,8 @@ def grow_diag_final_and(srclen, trglen, e2f, f2e):
     """
 
     # Converts pharaoh text format into list of tuples.
-    e2f = [tuple(map(int, a.split("-"))) for a in e2f.split()]
-    f2e = [tuple(map(int, a.split("-"))) for a in f2e.split()]
+    e2f = [tuple(map(int, a.split('-'))) for a in e2f.split()]
+    f2e = [tuple(map(int, a.split('-'))) for a in f2e.split()]
 
     neighbors = [(-1, 0), (0, -1), (1, 0), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1)]
     alignment = set(e2f).intersection(set(f2e))  # Find the intersection.
@@ -77,8 +77,8 @@ def grow_diag_final_and(srclen, trglen, e2f, f2e):
     # *aligned* is used to check if neighbors are aligned in grow_diag()
     aligned = defaultdict(set)
     for i, j in alignment:
-        aligned["e"].add(i)
-        aligned["f"].add(j)
+        aligned['e'].add(i)
+        aligned['f'].add(j)
 
     def grow_diag():
         """
@@ -105,8 +105,8 @@ def grow_diag_final_and(srclen, trglen, e2f, f2e):
                                 e_new not in aligned and f_new not in aligned
                             ) and neighbor in union:
                                 alignment.add(neighbor)
-                                aligned["e"].add(e_new)
-                                aligned["f"].add(f_new)
+                                aligned['e'].add(e_new)
+                                aligned['f'].add(f_new)
                                 prev_len += 1
                                 no_new_points = False
             # iterate until no new points added
@@ -130,8 +130,8 @@ def grow_diag_final_and(srclen, trglen, e2f, f2e):
                     and (e_new, f_new) in union
                 ):
                     alignment.add((e_new, f_new))
-                    aligned["e"].add(e_new)
-                    aligned["f"].add(f_new)
+                    aligned['e'].add(e_new)
+                    aligned['f'].add(f_new)
 
     grow_diag()
     final_and(e2f)
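
For reference, the e2f and f2e arguments handled at the top of grow_diag_final_and() are Pharaoh-format alignment strings, which the first two lines of the hunk convert into integer pairs:

    # Pharaoh format: space-separated "source-target" index pairs.
    e2f = '0-0 1-1 2-3'
    pairs = [tuple(map(int, a.split('-'))) for a in e2f.split()]
    # pairs == [(0, 0), (1, 1), (2, 3)]
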
index 9fe7214..43c3e99 100644 (file)
@@ -1,14 +1,14 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: GLEU Score
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors:
 # Contributors: Mike Schuster, Michael Wayne Goodman, Liling Tan
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 """ GLEU score implementation. """
-
+from __future__ import division
 from collections import Counter
 
 from nltk.util import ngrams, everygrams
index 013f5e4..ff243fd 100644 (file)
@@ -63,6 +63,7 @@ Translation: Parameter Estimation. Computational Linguistics, 19 (2),
 263-311.
 """
 
+from __future__ import division
 from collections import defaultdict
 from nltk.translate import AlignedSent
 from nltk.translate import Alignment
@@ -132,7 +133,7 @@ class IBMModel1(IBMModel):
             self.set_uniform_probabilities(sentence_aligned_corpus)
         else:
             # Set user-defined probabilities
-            self.translation_table = probability_tables["translation_table"]
+            self.translation_table = probability_tables['translation_table']
 
         for n in range(0, iterations):
             self.train(sentence_aligned_corpus)
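The probability_tables branch touched above lets a caller seed Model 1 with its own translation probabilities instead of the uniform initialisation. A hedged sketch of that usage (the toy corpus and the 0.5 seed value are illustrative only; the table is read as translation_table[trg_word][src_word] during training):

    from collections import defaultdict
    from nltk.translate import AlignedSent, IBMModel1

    bitext = [AlignedSent(['klein', 'ist', 'das', 'haus'],
                          ['the', 'house', 'is', 'small'])]
    seeded = {'translation_table': defaultdict(lambda: defaultdict(lambda: 0.5))}
    ibm1 = IBMModel1(bitext, 5, probability_tables=seeded)
    print(round(ibm1.translation_table['haus']['house'], 3))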
index a806d41..e235f59 100644 (file)
@@ -46,6 +46,8 @@ Translation: Parameter Estimation. Computational Linguistics, 19 (2),
 263-311.
 """
 
+from __future__ import division
+
 import warnings
 from collections import defaultdict
 
@@ -129,8 +131,8 @@ class IBMModel2(IBMModel):
             self.set_uniform_probabilities(sentence_aligned_corpus)
         else:
             # Set user-defined probabilities
-            self.translation_table = probability_tables["translation_table"]
-            self.alignment_table = probability_tables["alignment_table"]
+            self.translation_table = probability_tables['translation_table']
+            self.alignment_table = probability_tables['alignment_table']
 
         for n in range(0, iterations):
             self.train(sentence_aligned_corpus)
@@ -161,7 +163,7 @@ class IBMModel2(IBMModel):
         counts = Model2Counts()
         for aligned_sentence in parallel_corpus:
             src_sentence = [None] + aligned_sentence.mots
-            trg_sentence = ["UNUSED"] + aligned_sentence.words  # 1-indexed
+            trg_sentence = ['UNUSED'] + aligned_sentence.words  # 1-indexed
             l = len(aligned_sentence.mots)
             m = len(aligned_sentence.words)
 
index ed491f9..2c7c618 100644 (file)
@@ -73,6 +73,8 @@ Translation: Parameter Estimation. Computational Linguistics, 19 (2),
 263-311.
 """
 
+from __future__ import division
+
 import warnings
 from collections import defaultdict
 from math import factorial
@@ -169,11 +171,11 @@ class IBMModel3(IBMModel):
             self.set_uniform_probabilities(sentence_aligned_corpus)
         else:
             # Set user-defined probabilities
-            self.translation_table = probability_tables["translation_table"]
-            self.alignment_table = probability_tables["alignment_table"]
-            self.fertility_table = probability_tables["fertility_table"]
-            self.p1 = probability_tables["p1"]
-            self.distortion_table = probability_tables["distortion_table"]
+            self.translation_table = probability_tables['translation_table']
+            self.alignment_table = probability_tables['alignment_table']
+            self.fertility_table = probability_tables['fertility_table']
+            self.p1 = probability_tables['p1']
+            self.distortion_table = probability_tables['distortion_table']
 
         for n in range(0, iterations):
             self.train(sentence_aligned_corpus)
index fc6c295..323dd4d 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: IBM Model 4
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Tah Wei Hoon <hoon.tw@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -101,6 +101,8 @@ Translation: Parameter Estimation. Computational Linguistics, 19 (2),
 263-311.
 """
 
+from __future__ import division
+
 import warnings
 from collections import defaultdict
 from math import factorial
@@ -220,13 +222,13 @@ class IBMModel4(IBMModel):
             self.set_uniform_probabilities(sentence_aligned_corpus)
         else:
             # Set user-defined probabilities
-            self.translation_table = probability_tables["translation_table"]
-            self.alignment_table = probability_tables["alignment_table"]
-            self.fertility_table = probability_tables["fertility_table"]
-            self.p1 = probability_tables["p1"]
-            self.head_distortion_table = probability_tables["head_distortion_table"]
+            self.translation_table = probability_tables['translation_table']
+            self.alignment_table = probability_tables['alignment_table']
+            self.fertility_table = probability_tables['fertility_table']
+            self.p1 = probability_tables['p1']
+            self.head_distortion_table = probability_tables['head_distortion_table']
             self.non_head_distortion_table = probability_tables[
-                "non_head_distortion_table"
+                'non_head_distortion_table'
             ]
 
         for n in range(0, iterations):
index 88a64f2..b1b44e7 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: IBM Model 5
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Tah Wei Hoon <hoon.tw@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -111,6 +111,8 @@ Translation: Parameter Estimation. Computational Linguistics, 19 (2),
 263-311.
 """
 
+from __future__ import division
+
 import warnings
 from collections import defaultdict
 from math import factorial
@@ -236,16 +238,16 @@ class IBMModel5(IBMModel):
             self.set_uniform_probabilities(sentence_aligned_corpus)
         else:
             # Set user-defined probabilities
-            self.translation_table = probability_tables["translation_table"]
-            self.alignment_table = probability_tables["alignment_table"]
-            self.fertility_table = probability_tables["fertility_table"]
-            self.p1 = probability_tables["p1"]
-            self.head_distortion_table = probability_tables["head_distortion_table"]
+            self.translation_table = probability_tables['translation_table']
+            self.alignment_table = probability_tables['alignment_table']
+            self.fertility_table = probability_tables['fertility_table']
+            self.p1 = probability_tables['p1']
+            self.head_distortion_table = probability_tables['head_distortion_table']
             self.non_head_distortion_table = probability_tables[
-                "non_head_distortion_table"
+                'non_head_distortion_table'
             ]
-            self.head_vacancy_table = probability_tables["head_vacancy_table"]
-            self.non_head_vacancy_table = probability_tables["non_head_vacancy_table"]
+            self.head_vacancy_table = probability_tables['head_vacancy_table']
+            self.non_head_vacancy_table = probability_tables['non_head_vacancy_table']
 
         for n in range(0, iterations):
             self.train(sentence_aligned_corpus)
index 3b9b913..24f6928 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: IBM Model Core
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Tah Wei Hoon <hoon.tw@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -37,7 +37,7 @@ Robert L. Mercer. 1993. The Mathematics of Statistical Machine
 Translation: Parameter Estimation. Computational Linguistics, 19 (2),
 263-311.
 """
-
+from __future__ import division
 from bisect import insort_left
 from collections import defaultdict
 from copy import deepcopy
@@ -201,7 +201,7 @@ class IBMModel(object):
         :type i_pegged: int
         """
         src_sentence = [None] + sentence_pair.mots
-        trg_sentence = ["UNUSED"] + sentence_pair.words  # 1-indexed
+        trg_sentence = ['UNUSED'] + sentence_pair.words  # 1-indexed
 
         l = len(src_sentence) - 1  # exclude NULL
         m = len(trg_sentence) - 1
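The None/'UNUSED' padding shown in this hunk only shifts both sentences to 1-based positions so that source index 0 can stand for the NULL word, which the surrounding alignment code assumes. A small illustration with hypothetical tokens:

    mots = ['ja', 'wohl']                # hypothetical source tokens
    words = ['yes', 'indeed']            # hypothetical target tokens
    src_sentence = [None] + mots         # src_sentence[1] == 'ja'; index 0 is NULL
    trg_sentence = ['UNUSED'] + words    # trg_sentence[1] == 'yes'; index 0 unused
    l, m = len(src_sentence) - 1, len(trg_sentence) - 1   # exclude NULL
    print(l, m)                          # 2 2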
diff --git a/nlp_resource_data/nltk/translate/meteor_score.py b/nlp_resource_data/nltk/translate/meteor_score.py
deleted file mode 100644 (file)
index 008836f..0000000
+++ /dev/null
@@ -1,434 +0,0 @@
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: Machine Translation
-#
-# Copyright (C) 2001-2020 NLTK Project
-# Author: Uday Krishna <udaykrishna5@gmail.com>
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-
-from nltk.stem.porter import PorterStemmer
-from nltk.corpus import wordnet
-from itertools import chain, product
-
-
-def _generate_enums(hypothesis, reference, preprocess=str.lower):
-    """
-    Takes in string inputs for hypothesis and reference and returns
-    enumerated word lists for each of them
-
-    :param hypothesis: hypothesis string
-    :type hypothesis: str
-    :param reference: reference string
-    :type reference: str
-    :param preprocess: preprocessing method (default str.lower)
-    :type preprocess: method
-    :return: enumerated words list
-    :rtype: list of 2D tuples, list of 2D tuples
-    """
-    hypothesis_list = list(enumerate(preprocess(hypothesis).split()))
-    reference_list = list(enumerate(preprocess(reference).split()))
-    return hypothesis_list, reference_list
-
-
-def exact_match(hypothesis, reference):
-    """
-    matches exact words in hypothesis and reference
-    and returns a word mapping based on the enumerated
-    word id between hypothesis and reference
-
-    :param hypothesis: hypothesis string
-    :type hypothesis: str
-    :param reference: reference string
-    :type reference: str
-    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
-             enumerated unmatched reference tuples
-    :rtype: list of 2D tuples, list of 2D tuples,  list of 2D tuples
-    """
-    hypothesis_list, reference_list = _generate_enums(hypothesis, reference)
-    return _match_enums(hypothesis_list, reference_list)
-
-
-def _match_enums(enum_hypothesis_list, enum_reference_list):
-    """
-    matches exact words in hypothesis and reference and returns
-    a word mapping between enum_hypothesis_list and enum_reference_list
-    based on the enumerated word id.
-
-    :param enum_hypothesis_list: enumerated hypothesis list
-    :type enum_hypothesis_list: list of tuples
-    :param enum_reference_list: enumerated reference list
-    :type enum_reference_list: list of 2D tuples
-    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
-             enumerated unmatched reference tuples
-    :rtype: list of 2D tuples, list of 2D tuples,  list of 2D tuples
-    """
-    word_match = []
-    for i in range(len(enum_hypothesis_list))[::-1]:
-        for j in range(len(enum_reference_list))[::-1]:
-            if enum_hypothesis_list[i][1] == enum_reference_list[j][1]:
-                word_match.append(
-                    (enum_hypothesis_list[i][0], enum_reference_list[j][0])
-                )
-                (enum_hypothesis_list.pop(i)[1], enum_reference_list.pop(j)[1])
-                break
-    return word_match, enum_hypothesis_list, enum_reference_list
-
-
-def _enum_stem_match(
-    enum_hypothesis_list, enum_reference_list, stemmer=PorterStemmer()
-):
-    """
-    Stems each word and matches them in hypothesis and reference
-    and returns a word mapping between enum_hypothesis_list and
-    enum_reference_list based on the enumerated word id. The function also
-    returns an enumerated list of unmatched words for hypothesis and reference.
-
-    :param enum_hypothesis_list: enumerated hypothesis list
-    :type enum_hypothesis_list: list of 2D tuples
-    :param enum_reference_list: enumerated reference list
-    :type enum_reference_list: list of 2D tuples
-    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
-    :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
-    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
-             enumerated unmatched reference tuples
-    :rtype: list of 2D tuples, list of 2D tuples,  list of 2D tuples
-    """
-    stemmed_enum_list1 = [
-        (word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_hypothesis_list
-    ]
-
-    stemmed_enum_list2 = [
-        (word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_reference_list
-    ]
-
-    word_match, enum_unmat_hypo_list, enum_unmat_ref_list = _match_enums(
-        stemmed_enum_list1, stemmed_enum_list2
-    )
-
-    enum_unmat_hypo_list = (
-        list(zip(*enum_unmat_hypo_list)) if len(enum_unmat_hypo_list) > 0 else []
-    )
-
-    enum_unmat_ref_list = (
-        list(zip(*enum_unmat_ref_list)) if len(enum_unmat_ref_list) > 0 else []
-    )
-
-    enum_hypothesis_list = list(
-        filter(lambda x: x[0] not in enum_unmat_hypo_list, enum_hypothesis_list)
-    )
-
-    enum_reference_list = list(
-        filter(lambda x: x[0] not in enum_unmat_ref_list, enum_reference_list)
-    )
-
-    return word_match, enum_hypothesis_list, enum_reference_list
-
-
-def stem_match(hypothesis, reference, stemmer=PorterStemmer()):
-    """
-    Stems each word and matches them in hypothesis and reference
-    and returns a word mapping between hypothesis and reference
-
-    :param hypothesis: hypothesis string
-    :type hypothesis: str
-    :param reference: reference string
-    :type reference: str
-    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
-    :type stemmer: nltk.stem.api.StemmerI or any class that
-                   implements a stem method
-    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
-             enumerated unmatched reference tuples
-    :rtype: list of 2D tuples, list of 2D tuples,  list of 2D tuples
-    """
-    enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
-    return _enum_stem_match(enum_hypothesis_list, enum_reference_list, stemmer=stemmer)
-
-
-def _enum_wordnetsyn_match(enum_hypothesis_list, enum_reference_list, wordnet=wordnet):
-    """
-    Matches each word in reference to a word in hypothesis
-    if any synonym of a hypothesis word is the exact match
-    to the reference word.
-
-    :param enum_hypothesis_list: enumerated hypothesis list
-    :param enum_reference_list: enumerated reference list
-    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
-    :type wordnet: WordNetCorpusReader
-    :return: list of matched tuples, unmatched hypothesis list, unmatched reference list
-    :rtype:  list of tuples, list of tuples, list of tuples
-
-    """
-    word_match = []
-    for i in range(len(enum_hypothesis_list))[::-1]:
-        hypothesis_syns = set(
-            chain(
-                *[
-                    [
-                        lemma.name()
-                        for lemma in synset.lemmas()
-                        if lemma.name().find("_") < 0
-                    ]
-                    for synset in wordnet.synsets(enum_hypothesis_list[i][1])
-                ]
-            )
-        ).union({enum_hypothesis_list[i][1]})
-        for j in range(len(enum_reference_list))[::-1]:
-            if enum_reference_list[j][1] in hypothesis_syns:
-                word_match.append(
-                    (enum_hypothesis_list[i][0], enum_reference_list[j][0])
-                )
-                enum_hypothesis_list.pop(i), enum_reference_list.pop(j)
-                break
-    return word_match, enum_hypothesis_list, enum_reference_list
-
-
-def wordnetsyn_match(hypothesis, reference, wordnet=wordnet):
-    """
-    Matches each word in reference to a word in hypothesis if any synonym
-    of a hypothesis word is the exact match to the reference word.
-
-    :param hypothesis: hypothesis string
-    :param reference: reference string
-    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
-    :type wordnet: WordNetCorpusReader
-    :return: list of mapped tuples
-    :rtype: list of tuples
-    """
-    enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
-    return _enum_wordnetsyn_match(
-        enum_hypothesis_list, enum_reference_list, wordnet=wordnet
-    )
-
-
-def _enum_allign_words(
-    enum_hypothesis_list, enum_reference_list, stemmer=PorterStemmer(), wordnet=wordnet
-):
-    """
-    Aligns/matches words in the hypothesis to reference by sequentially
-    applying exact match, stemmed match and wordnet based synonym match.
-    In case there are multiple matches, the match with the least number
-    of crossings is chosen. Takes enumerated lists as input instead of
-    string input.
-
-    :param enum_hypothesis_list: enumerated hypothesis list
-    :param enum_reference_list: enumerated reference list
-    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
-    :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
-    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
-    :type wordnet: WordNetCorpusReader
-    :return: sorted list of matched tuples, unmatched hypothesis list,
-             unmatched reference list
-    :rtype: list of tuples, list of tuples, list of tuples
-    """
-    exact_matches, enum_hypothesis_list, enum_reference_list = _match_enums(
-        enum_hypothesis_list, enum_reference_list
-    )
-
-    stem_matches, enum_hypothesis_list, enum_reference_list = _enum_stem_match(
-        enum_hypothesis_list, enum_reference_list, stemmer=stemmer
-    )
-
-    wns_matches, enum_hypothesis_list, enum_reference_list = _enum_wordnetsyn_match(
-        enum_hypothesis_list, enum_reference_list, wordnet=wordnet
-    )
-
-    return (
-        sorted(
-            exact_matches + stem_matches + wns_matches, key=lambda wordpair: wordpair[0]
-        ),
-        enum_hypothesis_list,
-        enum_reference_list,
-    )
-
-
-def allign_words(hypothesis, reference, stemmer=PorterStemmer(), wordnet=wordnet):
-    """
-    Aligns/matches words in the hypothesis to reference by sequentially
-    applying exact match, stemmed match and wordnet based synonym match.
-    In case there are multiple matches, the match with the least number
-    of crossings is chosen.
-
-    :param hypothesis: hypothesis string
-    :param reference: reference string
-    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
-    :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
-    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
-    :type wordnet: WordNetCorpusReader
-    :return: sorted list of matched tuples, unmatched hypothesis list, unmatched reference list
-    :rtype: list of tuples, list of tuples, list of tuples
-    """
-    enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference)
-    return _enum_allign_words(
-        enum_hypothesis_list, enum_reference_list, stemmer=stemmer, wordnet=wordnet
-    )
-
-
-def _count_chunks(matches):
-    """
-    Counts the fewest possible number of chunks such that matched unigrams
-    of each chunk are adjacent to each other. This is used to calculate the
-    fragmentation part of the metric.
-
-    :param matches: list containing a mapping of matched words (output of allign_words)
-    :return: Number of chunks a sentence is divided into post alignment
-    :rtype: int
-    """
-    i = 0
-    chunks = 1
-    while i < len(matches) - 1:
-        if (matches[i + 1][0] == matches[i][0] + 1) and (
-            matches[i + 1][1] == matches[i][1] + 1
-        ):
-            i += 1
-            continue
-        i += 1
-        chunks += 1
-    return chunks
-
-
-def single_meteor_score(
-    reference,
-    hypothesis,
-    preprocess=str.lower,
-    stemmer=PorterStemmer(),
-    wordnet=wordnet,
-    alpha=0.9,
-    beta=3,
-    gamma=0.5,
-):
-    """
-    Calculates METEOR score for single hypothesis and reference as per
-    "Meteor: An Automatic Metric for MT Evaluation with HighLevels of
-    Correlation with Human Judgments" by Alon Lavie and Abhaya Agarwal,
-    in Proceedings of ACL.
-    http://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf
-
-
-    >>> hypothesis1 = 'It is a guide to action which ensures that the military always obeys the commands of the party'
-
-    >>> reference1 = 'It is a guide to action that ensures that the military will forever heed Party commands'
-
-
-    >>> round(single_meteor_score(reference1, hypothesis1),4)
-    0.7398
-
-        If there are no word matches during the alignment, the method returns a
-        score of 0. We can safely return zero instead of raising a
-        division-by-zero error, as no match usually implies a bad translation.
-
-    >>> round(single_meteor_score('this is a cat', 'non matching hypothesis'),4)
-    0.0
-
-    :param references: reference sentences
-    :type references: list(str)
-    :param hypothesis: a hypothesis sentence
-    :type hypothesis: str
-    :param preprocess: preprocessing function (default str.lower)
-    :type preprocess: method
-    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
-    :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
-    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
-    :type wordnet: WordNetCorpusReader
-    :param alpha: parameter for controlling relative weights of precision and recall.
-    :type alpha: float
-    :param beta: parameter for controlling the shape of the penalty as a
-                 function of fragmentation.
-    :type beta: float
-    :param gamma: relative weight assigned to fragmentation penalty.
-    :type gamma: float
-    :return: The sentence-level METEOR score.
-    :rtype: float
-    """
-    enum_hypothesis, enum_reference = _generate_enums(
-        hypothesis, reference, preprocess=preprocess
-    )
-    translation_length = len(enum_hypothesis)
-    reference_length = len(enum_reference)
-    matches, _, _ = _enum_allign_words(enum_hypothesis, enum_reference, stemmer=stemmer)
-    matches_count = len(matches)
-    try:
-        precision = float(matches_count) / translation_length
-        recall = float(matches_count) / reference_length
-        fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
-        chunk_count = float(_count_chunks(matches))
-        frag_frac = chunk_count / matches_count
-    except ZeroDivisionError:
-        return 0.0
-    penalty = gamma * frag_frac ** beta
-    return (1 - penalty) * fmean
-
-
-def meteor_score(
-    references,
-    hypothesis,
-    preprocess=str.lower,
-    stemmer=PorterStemmer(),
-    wordnet=wordnet,
-    alpha=0.9,
-    beta=3,
-    gamma=0.5,
-):
-    """
-    Calculates METEOR score for hypothesis with multiple references as
-    described in "Meteor: An Automatic Metric for MT Evaluation with
-    High Levels of Correlation with Human Judgments" by Alon Lavie and
-    Abhaya Agarwal, in Proceedings of ACL.
-    http://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf
-
-
-    In case of multiple references the best score is chosen. This method
-    iterates over single_meteor_score and picks the best pair among all
-    the references for a given hypothesis
-
-    >>> hypothesis1 = 'It is a guide to action which ensures that the military always obeys the commands of the party'
-    >>> hypothesis2 = 'It is to insure the troops forever hearing the activity guidebook that party direct'
-
-    >>> reference1 = 'It is a guide to action that ensures that the military will forever heed Party commands'
-    >>> reference2 = 'It is the guiding principle which guarantees the military forces always being under the command of the Party'
-    >>> reference3 = 'It is the practical guide for the army always to heed the directions of the party'
-
-    >>> round(meteor_score([reference1, reference2, reference3], hypothesis1),4)
-    0.7398
-
-        If there are no word matches during the alignment, the method returns a
-        score of 0. We can safely return zero instead of raising a
-        division-by-zero error, as no match usually implies a bad translation.
-
-    >>> round(meteor_score(['this is a cat'], 'non matching hypothesis'),4)
-    0.0
-
-    :param references: reference sentences
-    :type references: list(str)
-    :param hypothesis: a hypothesis sentence
-    :type hypothesis: str
-    :param preprocess: preprocessing function (default str.lower)
-    :type preprocess: method
-    :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer())
-    :type stemmer: nltk.stem.api.StemmerI or any class that implements a stem method
-    :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet)
-    :type wordnet: WordNetCorpusReader
-    :param alpha: parameter for controlling relative weights of precision and recall.
-    :type alpha: float
-    :param beta: parameter for controlling the shape of the penalty as a
-                 function of fragmentation.
-    :type beta: float
-    :param gamma: relative weight assigned to fragmentation penalty.
-    :type gamma: float
-    :return: The sentence-level METEOR score.
-    :rtype: float
-    """
-    return max(
-        [
-            single_meteor_score(
-                reference,
-                hypothesis,
-                stemmer=stemmer,
-                wordnet=wordnet,
-                alpha=alpha,
-                beta=beta,
-                gamma=gamma,
-            )
-            for reference in references
-        ]
-    )
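Since meteor_score.py is removed outright in this change, the arithmetic its single_meteor_score performed is worth keeping in mind when reviewing: unigram precision and recall are folded into a parameterised harmonic mean, then discounted by a fragmentation penalty derived from the chunk count. A self-contained sketch of that computation with hypothetical counts (plain Python, not the deleted API):

    alpha, beta, gamma = 0.9, 3, 0.5                     # defaults in the deleted code
    matches, hyp_len, ref_len, chunks = 10, 13, 14, 3    # hypothetical counts
    precision = matches / hyp_len
    recall = matches / ref_len
    fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
    penalty = gamma * (chunks / matches) ** beta
    print(round((1 - penalty) * fmean, 4))               # METEOR-style sentence score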
index d11addb..a984f96 100644 (file)
@@ -1,11 +1,12 @@
 # Natural Language Toolkit: Translation metrics
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Will Zhang <wilzzha@gmail.com>
 #         Guan Gui <ggui@student.unimelb.edu.au>
 #         Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import division
 
 
 def alignment_error_rate(reference, hypothesis, possible=None):
index ca9ac2b..57b2074 100644 (file)
@@ -1,13 +1,14 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: NIST Score
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors:
 # Contributors:
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 """NIST score implementation."""
+from __future__ import division
 
 import math
 import fractions
index a50887e..df2ba2d 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Phrase Extraction Algorithm
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Authors: Liling Tan, Fredrik Hedman, Petra Barancikova
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -41,11 +41,11 @@ def extract(
     :type f_start: int
     :param f_start: Starting index of the possible foreign language phrases
     :type f_end: int
-    :param f_end: End index of the possible foreign language phrases
+    :param f_end: Starting index of the possible foreign language phrases
     :type e_start: int
     :param e_start: Starting index of the possible source language phrases
     :type e_end: int
-    :param e_end: End index of the possible source language phrases
+    :param e_end: Starting index of the possible source language phrases
     :type srctext: list
     :param srctext: The source language tokens, a list of string.
     :type trgtext: list
@@ -75,10 +75,10 @@ def extract(
             trg_phrase = " ".join(trgtext[fs : fe + 1])
             # Include more data for later ordering.
             phrases.add(
-                ((e_start, e_end + 1), (fs, fe + 1), src_phrase, trg_phrase)
+                ((e_start, e_end + 1), (f_start, f_end + 1), src_phrase, trg_phrase)
             )
             fe += 1
-            if fe in f_aligned or fe >= trglen:
+            if fe in f_aligned or fe == trglen:
                 break
         fs -= 1
         if fs in f_aligned or fs < 0:
@@ -111,20 +111,20 @@ def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0):
     ...
     ((0, 1), (0, 1), 'michael', 'michael')
     ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus')
-    ((0, 2), (0, 5), 'michael assumes', 'michael geht davon aus ,')
+    ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus ,')
     ((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass')
     ((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er')
     ((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt')
     ((1, 2), (1, 4), 'assumes', 'geht davon aus')
-    ((1, 2), (1, 5), 'assumes', 'geht davon aus ,')
+    ((1, 2), (1, 4), 'assumes', 'geht davon aus ,')
     ((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass')
     ((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er')
     ((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt')
-    ((2, 3), (4, 6), 'that', ', dass')
+    ((2, 3), (5, 6), 'that', ', dass')
     ((2, 3), (5, 6), 'that', 'dass')
-    ((2, 4), (4, 7), 'that he', ', dass er')
+    ((2, 4), (5, 7), 'that he', ', dass er')
     ((2, 4), (5, 7), 'that he', 'dass er')
-    ((2, 9), (4, 10), 'that he will stay in the house', ', dass er im haus bleibt')
+    ((2, 9), (5, 10), 'that he will stay in the house', ', dass er im haus bleibt')
     ((2, 9), (5, 10), 'that he will stay in the house', 'dass er im haus bleibt')
     ((3, 4), (6, 7), 'he', 'er')
     ((3, 9), (6, 10), 'he will stay in the house', 'er im haus bleibt')
@@ -138,7 +138,7 @@ def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0):
     :param srctext: The sentence string from the source language.
     :type trgtext: str
     :param trgtext: The sentence string from the target language.
-    :type alignment: list(tuple)
+    :type alignment: str
     :param alignment: The word alignment outputs as list of tuples, where
         the first elements of tuples are the source words' indices and
         second elements are the target words' indices. This is also the output
index 912084f..fac42b0 100644 (file)
@@ -1,13 +1,13 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: RIBES Score
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Contributors: Katsuhito Sudoh, Liling Tan, Kasramvd, J.F.Sebastian
 #               Mark Byers, ekhumoro, P. Ortiz
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 """ RIBES score implementation """
-
+from __future__ import division
 from itertools import islice
 import math
 
index af0ce7e..2b4194a 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Stack decoder
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Tah Wei Hoon <hoon.tw@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
@@ -188,9 +188,9 @@ class StackDecoder(object):
 
         if not stacks[sentence_length]:
             warnings.warn(
-                "Unable to translate all words. "
-                "The source sentence contains words not in "
-                "the phrase table"
+                'Unable to translate all words. '
+                'The source sentence contains words not in '
+                'the phrase table'
             )
             # Instead of returning empty output, perhaps a partial
             # translation could be returned
@@ -238,7 +238,7 @@ class StackDecoder(object):
         subsequence covering positions 2, 3, and 4.
         :rtype: dict(int: (dict(int): float))
         """
-        scores = defaultdict(lambda: defaultdict(lambda: float("-inf")))
+        scores = defaultdict(lambda: defaultdict(lambda: float('-inf')))
         for seq_length in range(1, len(src_sentence) + 1):
             for start in range(0, len(src_sentence) - seq_length + 1):
                 end = start + seq_length
@@ -466,7 +466,7 @@ class _Stack(object):
         self.items = []
 
         if beam_threshold == 0.0:
-            self.__log_beam_threshold = float("-inf")
+            self.__log_beam_threshold = float('-inf')
         else:
             self.__log_beam_threshold = log(beam_threshold)
 
index 1614c45..9f79355 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Text Trees
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
 #         Peter Ljunglöf <peter.ljunglof@gu.se>
 Class for representing hierarchical language structures, such as
 syntax trees and morphological trees.
 """
+from __future__ import print_function, unicode_literals
 
 import re
-import sys
 from abc import ABCMeta, abstractmethod
 
+from six import string_types, add_metaclass
 
 from nltk.grammar import Production, Nonterminal
 from nltk.probability import ProbabilisticMixIn
 from nltk.util import slice_bounds
+from nltk.compat import python_2_unicode_compatible, unicode_repr
 from nltk.internals import raise_unorderable_types
 
 # TODO: add LabelledTree (can be used for dependency trees)
@@ -31,7 +33,7 @@ from nltk.internals import raise_unorderable_types
 ######################################################################
 
 
-
+@python_2_unicode_compatible
 class Tree(list):
     """
     A Tree represents a hierarchical grouping of leaves and subtrees.
@@ -101,7 +103,7 @@ class Tree(list):
             raise TypeError(
                 "%s: Expected a node value and child list " % type(self).__name__
             )
-        elif isinstance(children, str):
+        elif isinstance(children, string_types):
             raise TypeError(
                 "%s() argument 2 should be a list, not a "
                 "string" % type(self).__name__
@@ -142,16 +144,16 @@ class Tree(list):
     # ////////////////////////////////////////////////////////////
 
     def __mul__(self, v):
-        raise TypeError("Tree does not support multiplication")
+        raise TypeError('Tree does not support multiplication')
 
     def __rmul__(self, v):
-        raise TypeError("Tree does not support multiplication")
+        raise TypeError('Tree does not support multiplication')
 
     def __add__(self, v):
-        raise TypeError("Tree does not support addition")
+        raise TypeError('Tree does not support addition')
 
     def __radd__(self, v):
-        raise TypeError("Tree does not support addition")
+        raise TypeError('Tree does not support addition')
 
     # ////////////////////////////////////////////////////////////
     # Indexing (with support for tree positions)
@@ -178,7 +180,7 @@ class Tree(list):
             return list.__setitem__(self, index, value)
         elif isinstance(index, (list, tuple)):
             if len(index) == 0:
-                raise IndexError("The tree position () may not be " "assigned to.")
+                raise IndexError('The tree position () may not be ' 'assigned to.')
             elif len(index) == 1:
                 self[index[0]] = value
             else:
@@ -194,7 +196,7 @@ class Tree(list):
             return list.__delitem__(self, index)
         elif isinstance(index, (list, tuple)):
             if len(index) == 0:
-                raise IndexError("The tree position () may not be deleted.")
+                raise IndexError('The tree position () may not be deleted.')
             elif len(index) == 1:
                 del self[index[0]]
             else:
@@ -308,7 +310,7 @@ class Tree(list):
                 max_child_height = max(max_child_height, 1)
         return 1 + max_child_height
 
-    def treepositions(self, order="preorder"):
+    def treepositions(self, order='preorder'):
         """
             >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
             >>> t.treepositions() # doctest: +ELLIPSIS
@@ -322,7 +324,7 @@ class Tree(list):
             ``leaves``.
         """
         positions = []
-        if order in ("preorder", "bothorder"):
+        if order in ('preorder', 'bothorder'):
             positions.append(())
         for i, child in enumerate(self):
             if isinstance(child, Tree):
@@ -330,7 +332,7 @@ class Tree(list):
                 positions.extend((i,) + p for p in childpos)
             else:
                 positions.append((i,))
-        if order in ("postorder", "bothorder"):
+        if order in ('postorder', 'bothorder'):
             positions.append(())
         return positions
 
@@ -372,9 +374,9 @@ class Tree(list):
         :rtype: list(Production)
         """
 
-        if not isinstance(self._label, str):
+        if not isinstance(self._label, string_types):
             raise TypeError(
-                "Productions can only be generated from trees having node labels that are strings"
+                'Productions can only be generated from trees having node labels that are strings'
             )
 
         prods = [Production(Nonterminal(self._label), _child_names(self))]
@@ -413,7 +415,7 @@ class Tree(list):
             leaves, or if ``index<0``.
         """
         if index < 0:
-            raise IndexError("index must be non-negative")
+            raise IndexError('index must be non-negative')
 
         stack = [(self, ())]
         while stack:
@@ -427,7 +429,7 @@ class Tree(list):
                 for i in range(len(value) - 1, -1, -1):
                     stack.append((value[i], treepos + (i,)))
 
-        raise IndexError("index must be less than or equal to len(self)")
+        raise IndexError('index must be less than or equal to len(self)')
 
     def treeposition_spanning_leaves(self, start, end):
         """
@@ -436,7 +438,7 @@ class Tree(list):
         :raise ValueError: if ``end <= start``
         """
         if end <= start:
-            raise ValueError("end must be greater than start")
+            raise ValueError('end must be greater than start')
         # Find the tree positions of the start & end leaves, and
         # take the longest common subsequence.
         start_treepos = self.leaf_treeposition(start)
@@ -554,12 +556,6 @@ class Tree(list):
         else:
             return tree
 
-    def __copy__(self):
-        return self.copy()
-
-    def __deepcopy__(self, memo):
-        return self.copy(deep=True)
-
     def copy(self, deep=False):
         if not deep:
             return type(self)(self._label, self)
@@ -575,7 +571,7 @@ class Tree(list):
             newcopy = frozen_class.convert(self)
         else:
             newcopy = self.copy(deep=True)
-            for pos in newcopy.treepositions("leaves"):
+            for pos in newcopy.treepositions('leaves'):
                 newcopy[pos] = leaf_freezer(newcopy[pos])
             newcopy = frozen_class.convert(newcopy)
         hash(newcopy)  # Make sure the leaves are hashable.
@@ -589,7 +585,7 @@ class Tree(list):
     def fromstring(
         cls,
         s,
-        brackets="()",
+        brackets='()',
         read_node=None,
         read_leaf=None,
         node_pattern=None,
@@ -645,19 +641,19 @@ class Tree(list):
             then it will return a tree of that type.
         :rtype: Tree
         """
-        if not isinstance(brackets, str) or len(brackets) != 2:
-            raise TypeError("brackets must be a length-2 string")
-        if re.search("\s", brackets):
-            raise TypeError("whitespace brackets not allowed")
+        if not isinstance(brackets, string_types) or len(brackets) != 2:
+            raise TypeError('brackets must be a length-2 string')
+        if re.search('\s', brackets):
+            raise TypeError('whitespace brackets not allowed')
         # Construct a regexp that will tokenize the string.
         open_b, close_b = brackets
         open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b))
         if node_pattern is None:
-            node_pattern = "[^\s%s%s]+" % (open_pattern, close_pattern)
+            node_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
         if leaf_pattern is None:
-            leaf_pattern = "[^\s%s%s]+" % (open_pattern, close_pattern)
+            leaf_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
         token_re = re.compile(
-            "%s\s*(%s)?|%s|(%s)"
+            '%s\s*(%s)?|%s|(%s)'
             % (open_pattern, node_pattern, close_pattern, leaf_pattern)
         )
         # Walk through each token, updating a stack of trees.
@@ -667,7 +663,7 @@ class Tree(list):
             # Beginning of a tree/subtree
             if token[0] == open_b:
                 if len(stack) == 1 and len(stack[0][1]) > 0:
-                    cls._parse_error(s, match, "end-of-string")
+                    cls._parse_error(s, match, 'end-of-string')
                 label = token[1:].lstrip()
                 if read_node is not None:
                     label = read_node(label)
@@ -678,7 +674,7 @@ class Tree(list):
                     if len(stack[0][1]) == 0:
                         cls._parse_error(s, match, open_b)
                     else:
-                        cls._parse_error(s, match, "end-of-string")
+                        cls._parse_error(s, match, 'end-of-string')
                 label, children = stack.pop()
                 stack[-1][1].append(cls(label, children))
             # Leaf node
@@ -691,9 +687,9 @@ class Tree(list):
 
         # check that we got exactly one complete tree.
         if len(stack) > 1:
-            cls._parse_error(s, "end-of-string", close_b)
+            cls._parse_error(s, 'end-of-string', close_b)
         elif len(stack[0][1]) == 0:
-            cls._parse_error(s, "end-of-string", open_b)
+            cls._parse_error(s, 'end-of-string', open_b)
         else:
             assert stack[0][0] is None
             assert len(stack[0][1]) == 1
@@ -701,7 +697,7 @@ class Tree(list):
 
         # If the tree has an extra level with node='', then get rid of
         # it.  E.g.: "((S (NP ...) (VP ...)))"
-        if remove_empty_top_bracketing and tree._label == "" and len(tree) == 1:
+        if remove_empty_top_bracketing and tree._label == '' and len(tree) == 1:
             tree = tree[0]
         # return the tree.
         return tree
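The fromstring hunks above swap quote styles around the tokenizer patterns and error messages without changing parsing behaviour; the length-2 brackets check they touch is what allows custom bracket pairs. A brief usage sketch:

    from nltk import Tree

    t = Tree.fromstring('[S [NP I] [VP [V saw] [NP him]]]', brackets='[]')
    print(t.label())    # S
    print(t.leaves())   # ['I', 'saw', 'him']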
@@ -715,26 +711,26 @@ class Tree(list):
         :param expecting: what we expected to see instead.
         """
         # Construct a basic error message
-        if match == "end-of-string":
-            pos, token = len(s), "end-of-string"
+        if match == 'end-of-string':
+            pos, token = len(s), 'end-of-string'
         else:
             pos, token = match.start(), match.group()
-        msg = "%s.read(): expected %r but got %r\n%sat index %d." % (
+        msg = '%s.read(): expected %r but got %r\n%sat index %d.' % (
             cls.__name__,
             expecting,
             token,
-            " " * 12,
+            ' ' * 12,
             pos,
         )
         # Add a display showing the error token itself:
-        s = s.replace("\n", " ").replace("\t", " ")
+        s = s.replace('\n', ' ').replace('\t', ' ')
         offset = pos
         if len(s) > pos + 10:
-            s = s[: pos + 10] + "..."
+            s = s[: pos + 10] + '...'
         if pos > 10:
-            s = "..." + s[pos - 10 :]
+            s = '...' + s[pos - 10 :]
             offset = 13
-        msg += '\n%s"%s"\n%s^' % (" " * 16, s, " " * (17 + offset))
+        msg += '\n%s"%s"\n%s^' % (' ' * 16, s, ' ' * (17 + offset))
         raise ValueError(msg)
 
     # ////////////////////////////////////////////////////////////
@@ -760,10 +756,10 @@ class Tree(list):
         print(TreePrettyPrinter(self, sentence, highlight).text(**kwargs), file=stream)
 
     def __repr__(self):
-        childstr = ", ".join(repr(c) for c in self)
-        return "%s(%s, [%s])" % (
+        childstr = ", ".join(unicode_repr(c) for c in self)
+        return '%s(%s, [%s])' % (
             type(self).__name__,
-            repr(self._label),
+            unicode_repr(self._label),
             childstr,
         )
 
@@ -786,37 +782,26 @@ class Tree(list):
         _canvas_frame.add_widget(widget)
         x, y, w, h = widget.bbox()
         # print_to_file uses scrollregion to set the width and height of the pdf.
-        _canvas_frame.canvas()["scrollregion"] = (0, 0, w, h)
+        _canvas_frame.canvas()['scrollregion'] = (0, 0, w, h)
         with tempfile.NamedTemporaryFile() as file:
-            in_path = "{0:}.ps".format(file.name)
-            out_path = "{0:}.png".format(file.name)
+            in_path = '{0:}.ps'.format(file.name)
+            out_path = '{0:}.png'.format(file.name)
             _canvas_frame.print_to_file(in_path)
             _canvas_frame.destroy_widget(widget)
-            try:
-                subprocess.call(
-                    [
-                        find_binary(
-                            "gs",
-                            binary_names=["gswin32c.exe", "gswin64c.exe"],
-                            env_vars=["PATH"],
-                            verbose=False,
-                        )
-                    ]
-                    + "-q -dEPSCrop -sDEVICE=png16m -r90 -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dSAFER -dBATCH -dNOPAUSE -sOutputFile={0:} {1:}".format(
-                        out_path, in_path
-                    ).split()
-                )
-            except LookupError:
-                pre_error_message = str(
-                    "The Ghostscript executable isn't found.\n"
-                    "See http://web.mit.edu/ghostscript/www/Install.htm\n"
-                    "If you're using a Mac, you can try installing\n"
-                    "https://docs.brew.sh/Installation then `brew install ghostscript`"
-                )
-                print(pre_error_message, file=sys.stderr)
-                raise LookupError
-
-            with open(out_path, "rb") as sr:
+            subprocess.call(
+                [
+                    find_binary(
+                        'gs',
+                        binary_names=['gswin32c.exe', 'gswin64c.exe'],
+                        env_vars=['PATH'],
+                        verbose=False,
+                    )
+                ]
+                + '-q -dEPSCrop -sDEVICE=png16m -r90 -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dSAFER -dBATCH -dNOPAUSE -sOutputFile={0:} {1:}'.format(
+                    out_path, in_path
+                ).split()
+            )
+            with open(out_path, 'rb') as sr:
                 res = sr.read()
             os.remove(in_path)
             os.remove(out_path)
@@ -837,7 +822,7 @@ class Tree(list):
             stream = None
         print(self.pformat(**kwargs), file=stream)
 
-    def pformat(self, margin=70, indent=0, nodesep="", parens="()", quotes=False):
+    def pformat(self, margin=70, indent=0, nodesep='', parens='()', quotes=False):
         """
         :return: A pretty-printed string representation of this tree.
         :rtype: str
@@ -858,23 +843,23 @@ class Tree(list):
             return s
 
         # If it doesn't fit on one line, then write it on multi-lines.
-        if isinstance(self._label, str):
-            s = "%s%s%s" % (parens[0], self._label, nodesep)
+        if isinstance(self._label, string_types):
+            s = '%s%s%s' % (parens[0], self._label, nodesep)
         else:
-            s = "%s%s%s" % (parens[0], repr(self._label), nodesep)
+            s = '%s%s%s' % (parens[0], unicode_repr(self._label), nodesep)
         for child in self:
             if isinstance(child, Tree):
                 s += (
-                    "\n"
-                    + " " * (indent + 2)
+                    '\n'
+                    + ' ' * (indent + 2)
                     + child.pformat(margin, indent + 2, nodesep, parens, quotes)
                 )
             elif isinstance(child, tuple):
-                s += "\n" + " " * (indent + 2) + "/".join(child)
-            elif isinstance(child, str) and not quotes:
-                s += "\n" + " " * (indent + 2) + "%s" % child
+                s += '\n' + ' ' * (indent + 2) + "/".join(child)
+            elif isinstance(child, string_types) and not quotes:
+                s += '\n' + ' ' * (indent + 2) + '%s' % child
             else:
-                s += "\n" + " " * (indent + 2) + repr(child)
+                s += '\n' + ' ' * (indent + 2) + unicode_repr(child)
         return s + parens[1]
 
     def pformat_latex_qtree(self):
@@ -895,10 +880,10 @@ class Tree(list):
         :return: A latex qtree representation of this tree.
         :rtype: str
         """
-        reserved_chars = re.compile("([#\$%&~_\{\}])")
+        reserved_chars = re.compile('([#\$%&~_\{\}])')
 
-        pformat = self.pformat(indent=6, nodesep="", parens=("[.", " ]"))
-        return r"\Tree " + re.sub(reserved_chars, r"\\\1", pformat)
+        pformat = self.pformat(indent=6, nodesep='', parens=('[.', ' ]'))
+        return r'\Tree ' + re.sub(reserved_chars, r'\\\1', pformat)
 
     def _pformat_flat(self, nodesep, parens, quotes):
         childstrs = []
@@ -907,12 +892,12 @@ class Tree(list):
                 childstrs.append(child._pformat_flat(nodesep, parens, quotes))
             elif isinstance(child, tuple):
                 childstrs.append("/".join(child))
-            elif isinstance(child, str) and not quotes:
-                childstrs.append("%s" % child)
+            elif isinstance(child, string_types) and not quotes:
+                childstrs.append('%s' % child)
             else:
-                childstrs.append(repr(child))
-        if isinstance(self._label, str):
-            return "%s%s%s %s%s" % (
+                childstrs.append(unicode_repr(child))
+        if isinstance(self._label, string_types):
+            return '%s%s%s %s%s' % (
                 parens[0],
                 self._label,
                 nodesep,
@@ -920,9 +905,9 @@ class Tree(list):
                 parens[1],
             )
         else:
-            return "%s%s%s %s%s" % (
+            return '%s%s%s %s%s' % (
                 parens[0],
-                repr(self._label),
+                unicode_repr(self._label),
                 nodesep,
                 " ".join(childstrs),
                 parens[1],
@@ -942,40 +927,40 @@ class ImmutableTree(Tree):
             )
 
     def __setitem__(self, index, value):
-        raise ValueError("%s may not be modified" % type(self).__name__)
+        raise ValueError('%s may not be modified' % type(self).__name__)
 
     def __setslice__(self, i, j, value):
-        raise ValueError("%s may not be modified" % type(self).__name__)
+        raise ValueError('%s may not be modified' % type(self).__name__)
 
     def __delitem__(self, index):
-        raise ValueError("%s may not be modified" % type(self).__name__)
+        raise ValueError('%s may not be modified' % type(self).__name__)
 
     def __delslice__(self, i, j):
-        raise ValueError("%s may not be modified" % type(self).__name__)
+        raise ValueError('%s may not be modified' % type(self).__name__)
 
     def __iadd__(self, other):
-        raise ValueError("%s may not be modified" % type(self).__name__)
+        raise ValueError('%s may not be modified' % type(self).__name__)
 
     def __imul__(self, other):
-        raise ValueError("%s may not be modified" % type(self).__name__)
+        raise ValueError('%s may not be modified' % type(self).__name__)
 
     def append(self, v):
-        raise ValueError("%s may not be modified" % type(self).__name__)
+        raise ValueError('%s may not be modified' % type(self).__name__)
 
     def extend(self, v):
-        raise ValueError("%s may not be modified" % type(self).__name__)
+        raise ValueError('%s may not be modified' % type(self).__name__)
 
     def pop(self, v=None):
-        raise ValueError("%s may not be modified" % type(self).__name__)
+        raise ValueError('%s may not be modified' % type(self).__name__)
 
     def remove(self, v):
-        raise ValueError("%s may not be modified" % type(self).__name__)
+        raise ValueError('%s may not be modified' % type(self).__name__)
 
     def reverse(self):
-        raise ValueError("%s may not be modified" % type(self).__name__)
+        raise ValueError('%s may not be modified' % type(self).__name__)
 
     def sort(self):
-        raise ValueError("%s may not be modified" % type(self).__name__)
+        raise ValueError('%s may not be modified' % type(self).__name__)
 
     def __hash__(self):
         return self._hash
@@ -985,15 +970,16 @@ class ImmutableTree(Tree):
         Set the node label.  This will only succeed the first time the
         node label is set, which should occur in ImmutableTree.__init__().
         """
-        if hasattr(self, "_label"):
-            raise ValueError("%s may not be modified" % type(self).__name__)
+        if hasattr(self, '_label'):
+            raise ValueError('%s may not be modified' % type(self).__name__)
         self._label = value
 
 
 ######################################################################
 ## Parented trees
 ######################################################################
-class AbstractParentedTree(Tree, metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class AbstractParentedTree(Tree):
     """
     An abstract base class for a ``Tree`` that automatically maintains
     pointers to parent nodes.  These parent pointers are updated
@@ -1094,7 +1080,7 @@ class AbstractParentedTree(Tree, metaclass=ABCMeta):
             if index < 0:
                 index += len(self)
             if index < 0:
-                raise IndexError("index out of range")
+                raise IndexError('index out of range')
             # Clear the child's parent pointer.
             if isinstance(self[index], Tree):
                 self._delparent(self[index], index)
@@ -1104,7 +1090,7 @@ class AbstractParentedTree(Tree, metaclass=ABCMeta):
         elif isinstance(index, (list, tuple)):
             # del ptree[()]
             if len(index) == 0:
-                raise IndexError("The tree position () may not be deleted.")
+                raise IndexError('The tree position () may not be deleted.')
             # del ptree[(i,)]
             elif len(index) == 1:
                 del self[index[0]]
@@ -1148,7 +1134,7 @@ class AbstractParentedTree(Tree, metaclass=ABCMeta):
             if index < 0:
                 index += len(self)
             if index < 0:
-                raise IndexError("index out of range")
+                raise IndexError('index out of range')
             # if the value is not changing, do nothing.
             if value is self[index]:
                 return
@@ -1164,7 +1150,7 @@ class AbstractParentedTree(Tree, metaclass=ABCMeta):
         elif isinstance(index, (list, tuple)):
             # ptree[()] = value
             if len(index) == 0:
-                raise IndexError("The tree position () may not be assigned to.")
+                raise IndexError('The tree position () may not be assigned to.')
             # ptree[(i,)] = value
             elif len(index) == 1:
                 self[index[0]] = value
@@ -1206,7 +1192,7 @@ class AbstractParentedTree(Tree, metaclass=ABCMeta):
         if index < 0:
             index += len(self)
         if index < 0:
-            raise IndexError("index out of range")
+            raise IndexError('index out of range')
         if isinstance(self[index], Tree):
             self._delparent(self[index], index)
         return super(AbstractParentedTree, self).pop(index)
@@ -1225,7 +1211,7 @@ class AbstractParentedTree(Tree, metaclass=ABCMeta):
     # __getitem__ etc., but use max(0, start) and max(0, stop) because
     # because negative indices are already handled *before*
     # __getslice__ is called; and we don't want to double-count them.
-    if hasattr(list, "__getslice__"):
+    if hasattr(list, '__getslice__'):
 
         def __getslice__(self, start, stop):
             return self.__getitem__(slice(max(0, start), max(0, stop)))
@@ -1293,7 +1279,7 @@ class ParentedTree(AbstractParentedTree):
         for i, child in enumerate(self._parent):
             if child is self:
                 return i
-        assert False, "expected to find self in self._parent!"
+        assert False, 'expected to find self in self._parent!'
 
     def left_sibling(self):
         """The left sibling of this tree, or None if it has none."""
@@ -1347,12 +1333,12 @@ class ParentedTree(AbstractParentedTree):
         # If the child's type is incorrect, then complain.
         if not isinstance(child, ParentedTree):
             raise TypeError(
-                "Can not insert a non-ParentedTree " + "into a ParentedTree"
+                'Can not insert a non-ParentedTree ' + 'into a ParentedTree'
             )
 
         # If child already has a parent, then complain.
         if child._parent is not None:
-            raise ValueError("Can not insert a subtree that already " "has a parent.")
+            raise ValueError('Can not insert a subtree that already ' 'has a parent.')
 
         # Set child's parent pointer & index.
         if not dry_run:
@@ -1526,7 +1512,7 @@ class MultiParentedTree(AbstractParentedTree):
         # If the child's type is incorrect, then complain.
         if not isinstance(child, MultiParentedTree):
             raise TypeError(
-                "Can not insert a non-MultiParentedTree " + "into a MultiParentedTree"
+                'Can not insert a non-MultiParentedTree ' + 'into a MultiParentedTree'
             )
 
         # Add self as a parent pointer if it's not already listed.
@@ -1551,7 +1537,7 @@ class ImmutableMultiParentedTree(ImmutableTree, MultiParentedTree):
 ######################################################################
 
 
-
+@python_2_unicode_compatible
 class ProbabilisticTree(Tree, ProbabilisticMixIn):
     def __init__(self, node, children=None, **prob_kwargs):
         Tree.__init__(self, node, children)
@@ -1562,10 +1548,10 @@ class ProbabilisticTree(Tree, ProbabilisticMixIn):
         return ImmutableProbabilisticTree
 
     def __repr__(self):
-        return "%s (p=%r)" % (Tree.__repr__(self), self.prob())
+        return '%s (p=%r)' % (Tree.unicode_repr(self), self.prob())
 
     def __str__(self):
-        return "%s (p=%.6g)" % (self.pformat(margin=60), self.prob())
+        return '%s (p=%.6g)' % (self.pformat(margin=60), self.prob())
 
     def copy(self, deep=False):
         if not deep:
@@ -1604,7 +1590,7 @@ class ProbabilisticTree(Tree, ProbabilisticMixIn):
             return self.__class__.__name__ < other.__class__.__name__
 
 
-
+@python_2_unicode_compatible
 class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn):
     def __init__(self, node, children=None, **prob_kwargs):
         ImmutableTree.__init__(self, node, children)
@@ -1616,10 +1602,10 @@ class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn):
         return ImmutableProbabilisticTree
 
     def __repr__(self):
-        return "%s [%s]" % (Tree.__repr__(self), self.prob())
+        return '%s [%s]' % (Tree.unicode_repr(self), self.prob())
 
     def __str__(self):
-        return "%s [%s]" % (self.pformat(margin=60), self.prob())
+        return '%s [%s]' % (self.pformat(margin=60), self.prob())
 
     def copy(self, deep=False):
         if not deep:
@@ -1672,21 +1658,21 @@ def sinica_parse(s):
     :param s: The string to be converted
     :type s: str
     """
-    tokens = re.split(r"([()| ])", s)
+    tokens = re.split(r'([()| ])', s)
     for i in range(len(tokens)):
-        if tokens[i] == "(":
+        if tokens[i] == '(':
             tokens[i - 1], tokens[i] = (
                 tokens[i],
                 tokens[i - 1],
             )  # pull nonterminal inside parens
-        elif ":" in tokens[i]:
-            fields = tokens[i].split(":")
+        elif ':' in tokens[i]:
+            fields = tokens[i].split(':')
             if len(fields) == 2:  # non-terminal
                 tokens[i] = fields[1]
             else:
                 tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")"
-        elif tokens[i] == "|":
-            tokens[i] = ""
+        elif tokens[i] == '|':
+            tokens[i] = ''
 
     treebank_string = " ".join(tokens)
     return Tree.fromstring(treebank_string, remove_empty_top_bracketing=True)
@@ -1713,7 +1699,7 @@ def demo():
     from nltk import Tree, ProbabilisticTree
 
     # Demonstrate tree parsing.
-    s = "(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))"
+    s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))'
     t = Tree.fromstring(s)
     print("Convert bracketed string into tree:")
     print(t)
@@ -1731,10 +1717,10 @@ def demo():
 
     # Demonstrate tree modification.
     the_cat = t[0]
-    the_cat.insert(1, Tree.fromstring("(JJ big)"))
+    the_cat.insert(1, Tree.fromstring('(JJ big)'))
     print("Tree modification:")
     print(t)
-    t[1, 1, 1] = Tree.fromstring("(NN cake)")
+    t[1, 1, 1] = Tree.fromstring('(NN cake)')
     print(t)
     print()
 
@@ -1748,7 +1734,7 @@ def demo():
     print()
 
     # Demonstrate probabilistic trees.
-    pt = ProbabilisticTree("x", ["y", "z"], prob=0.5)
+    pt = ProbabilisticTree('x', ['y', 'z'], prob=0.5)
     print("Probabilistic Tree:")
     print(pt)
     print()
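
A similarly hedged sketch of the ProbabilisticTree constructed just above; the printed form follows the __str__ defined earlier in this diff ('%s (p=%.6g)'):

    >>> from nltk import ProbabilisticTree
    >>> pt = ProbabilisticTree('x', ['y', 'z'], prob=0.5)
    >>> pt.prob()
    0.5
    >>> print(pt)
    (x y z) (p=0.5)
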
@@ -1770,20 +1756,20 @@ def demo():
     print()
 
     # Demonstrate tree nodes containing objects other than strings
-    t.set_label(("test", 3))
+    t.set_label(('test', 3))
     print(t)
 
 
 __all__ = [
-    "ImmutableProbabilisticTree",
-    "ImmutableTree",
-    "ProbabilisticMixIn",
-    "ProbabilisticTree",
-    "Tree",
-    "bracket_parse",
-    "sinica_parse",
-    "ParentedTree",
-    "MultiParentedTree",
-    "ImmutableParentedTree",
-    "ImmutableMultiParentedTree",
+    'ImmutableProbabilisticTree',
+    'ImmutableTree',
+    'ProbabilisticMixIn',
+    'ProbabilisticTree',
+    'Tree',
+    'bracket_parse',
+    'sinica_parse',
+    'ParentedTree',
+    'MultiParentedTree',
+    'ImmutableParentedTree',
+    'ImmutableMultiParentedTree',
 ]
index 50b0bb0..260f431 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: ASCII visualization of NLTK trees
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Andreas van Cranenburgh <A.W.vanCranenburgh@uva.nl>
 #         Peter Ljunglöf <peter.ljunglof@gu.se>
 # URL: <http://nltk.org/>
@@ -18,29 +18,30 @@ Graph Algorithms and Applications, 10(2) 141--157 (2006)149.
 http://jgaa.info/accepted/2006/EschbachGuentherBecker2006.10.2.pdf
 """
 
+from __future__ import division, print_function, unicode_literals
+
 import re
-try:
-    from html import escape
-except ImportError:
-    from cgi import escape
+from cgi import escape
 from collections import defaultdict
 from operator import itemgetter
 
 from nltk.util import OrderedDict
+from nltk.compat import python_2_unicode_compatible
 from nltk.tree import Tree
 
 ANSICOLOR = {
-    "black": 30,
-    "red": 31,
-    "green": 32,
-    "yellow": 33,
-    "blue": 34,
-    "magenta": 35,
-    "cyan": 36,
-    "white": 37,
+    'black': 30,
+    'red': 31,
+    'green': 32,
+    'yellow': 33,
+    'blue': 34,
+    'magenta': 35,
+    'cyan': 36,
+    'white': 37,
 }
 
 
+@python_2_unicode_compatible
 class TreePrettyPrinter(object):
     """
     Pretty-print a tree in text format, either as ASCII or Unicode.
@@ -92,8 +93,8 @@ class TreePrettyPrinter(object):
                             if not isinstance(b, Tree):
                                 a[n] = len(sentence)
                                 if type(b) == tuple:
-                                    b = "/".join(b)
-                                sentence.append("%s" % b)
+                                    b = '/'.join(b)
+                                sentence.append('%s' % b)
         self.nodes, self.coords, self.edges, self.highlight = self.nodecoords(
             tree, sentence, highlight
         )
@@ -102,7 +103,7 @@ class TreePrettyPrinter(object):
         return self.text()
 
     def __repr__(self):
-        return "<TreePrettyPrinter with %d nodes>" % len(self.nodes)
+        return '<TreePrettyPrinter with %d nodes>' % len(self.nodes)
 
     @staticmethod
     def nodecoords(tree, sentence, highlight):
@@ -190,27 +191,27 @@ class TreePrettyPrinter(object):
                             i += scale
                             j -= scale
             raise ValueError(
-                "could not find a free cell for:\n%s\n%s"
-                "min=%d; max=%d" % (tree[m], minidx, maxidx, dumpmatrix())
+                'could not find a free cell for:\n%s\n%s'
+                'min=%d; max=%d' % (tree[m], minidx, maxidx, dumpmatrix())
             )
 
         def dumpmatrix():
             """Dump matrix contents for debugging purposes."""
-            return "\n".join(
-                "%2d: %s" % (n, " ".join(("%2r" % i)[:2] for i in row))
+            return '\n'.join(
+                '%2d: %s' % (n, ' '.join(('%2r' % i)[:2] for i in row))
                 for n, row in enumerate(matrix)
             )
 
         leaves = tree.leaves()
         if not all(isinstance(n, int) for n in leaves):
-            raise ValueError("All leaves must be integer indices.")
+            raise ValueError('All leaves must be integer indices.')
         if len(leaves) != len(set(leaves)):
-            raise ValueError("Indices must occur at most once.")
+            raise ValueError('Indices must occur at most once.')
         if not all(0 <= n < len(sentence) for n in leaves):
             raise ValueError(
-                "All leaves must be in the interval 0..n "
-                "with n=len(sentence)\ntokens: %d indices: "
-                "%r\nsentence: %s" % (len(sentence), tree.leaves(), sentence)
+                'All leaves must be in the interval 0..n '
+                'with n=len(sentence)\ntokens: %d indices: '
+                '%r\nsentence: %s' % (len(sentence), tree.leaves(), sentence)
             )
         vertline, corner = -1, -2  # constants
         tree = tree.copy(True)
@@ -248,7 +249,7 @@ class TreePrettyPrinter(object):
             matrix[0][i] = ids[m]
             nodes[ids[m]] = sentence[tree[m]]
             if nodes[ids[m]] is None:
-                nodes[ids[m]] = "..."
+                nodes[ids[m]] = '...'
                 highlighted_nodes.discard(ids[m])
             positions.remove(m)
             childcols[m[:-1]].add((0, i))
@@ -334,9 +335,9 @@ class TreePrettyPrinter(object):
         unicodelines=False,
         html=False,
         ansi=False,
-        nodecolor="blue",
-        leafcolor="red",
-        funccolor="green",
+        nodecolor='blue',
+        leafcolor='red',
+        funccolor='green',
         abbreviate=None,
         maxwidth=16,
     ):
@@ -358,28 +359,28 @@ class TreePrettyPrinter(object):
         if abbreviate == True:
             abbreviate = 5
         if unicodelines:
-            horzline = "\u2500"
-            leftcorner = "\u250c"
-            rightcorner = "\u2510"
-            vertline = " \u2502 "
-            tee = horzline + "\u252C" + horzline
-            bottom = horzline + "\u2534" + horzline
-            cross = horzline + "\u253c" + horzline
-            ellipsis = "\u2026"
+            horzline = '\u2500'
+            leftcorner = '\u250c'
+            rightcorner = '\u2510'
+            vertline = ' \u2502 '
+            tee = horzline + '\u252C' + horzline
+            bottom = horzline + '\u2534' + horzline
+            cross = horzline + '\u253c' + horzline
+            ellipsis = '\u2026'
         else:
-            horzline = "_"
-            leftcorner = rightcorner = " "
-            vertline = " | "
+            horzline = '_'
+            leftcorner = rightcorner = ' '
+            vertline = ' | '
             tee = 3 * horzline
-            cross = bottom = "_|_"
-            ellipsis = "."
+            cross = bottom = '_|_'
+            ellipsis = '.'
 
         def crosscell(cur, x=vertline):
             """Overwrite center of this cell with a vertical branch."""
             splitl = len(cur) - len(cur) // 2 - len(x) // 2 - 1
             lst = list(cur)
             lst[splitl : splitl + len(x)] = list(x)
-            return "".join(lst)
+            return ''.join(lst)
 
         result = []
         matrix = defaultdict(dict)
@@ -391,7 +392,7 @@ class TreePrettyPrinter(object):
         childcols = defaultdict(set)
         labels = {}
         wrapre = re.compile(
-            "(.{%d,%d}\\b\\W*|.{%d})" % (maxwidth - 4, maxwidth, maxwidth)
+            '(.{%d,%d}\\b\\W*|.{%d})' % (maxwidth - 4, maxwidth, maxwidth)
         )
         # collect labels and coordinates
         for a in self.nodes:
@@ -406,8 +407,8 @@ class TreePrettyPrinter(object):
             if abbreviate and len(label) > abbreviate:
                 label = label[:abbreviate] + ellipsis
             if maxwidth and len(label) > maxwidth:
-                label = wrapre.sub(r"\1\n", label).strip()
-            label = label.split("\n")
+                label = wrapre.sub(r'\1\n', label).strip()
+            label = label.split('\n')
             maxnodeheight[row] = max(maxnodeheight[row], len(label))
             maxnodewith[column] = max(maxnodewith[column], max(map(len, label)))
             labels[a] = label
@@ -420,10 +421,10 @@ class TreePrettyPrinter(object):
         # bottom up level order traversal
         for row in sorted(matrix, reverse=True):
             noderows = [
-                ["".center(maxnodewith[col]) for col in range(maxcol + 1)]
+                [''.center(maxnodewith[col]) for col in range(maxcol + 1)]
                 for _ in range(maxnodeheight[row])
             ]
-            branchrow = ["".center(maxnodewith[col]) for col in range(maxcol + 1)]
+            branchrow = [''.center(maxnodewith[col]) for col in range(maxcol + 1)]
             for col in matrix[row]:
                 n = matrix[row][col]
                 node = self.nodes[n]
@@ -433,10 +434,10 @@ class TreePrettyPrinter(object):
                     if n in minchildcol and minchildcol[n] < maxchildcol[n]:
                         i, j = minchildcol[n], maxchildcol[n]
                         a, b = (maxnodewith[i] + 1) // 2 - 1, maxnodewith[j] // 2
-                        branchrow[i] = ((" " * a) + leftcorner).ljust(
+                        branchrow[i] = ((' ' * a) + leftcorner).ljust(
                             maxnodewith[i], horzline
                         )
-                        branchrow[j] = (rightcorner + (" " * b)).rjust(
+                        branchrow[j] = (rightcorner + (' ' * b)).rjust(
                             maxnodewith[j], horzline
                         )
                         for i in range(minchildcol[n] + 1, maxchildcol[n]):
@@ -453,22 +454,22 @@ class TreePrettyPrinter(object):
                         branchrow[col] = crosscell(branchrow[col])
                 text = [a.center(maxnodewith[col]) for a in text]
                 color = nodecolor if isinstance(node, Tree) else leafcolor
-                if isinstance(node, Tree) and node.label().startswith("-"):
+                if isinstance(node, Tree) and node.label().startswith('-'):
                     color = funccolor
                 if html:
-                    text = [escape(a, quote=False) for a in text]
+                    text = [escape(a) for a in text]
                     if n in self.highlight:
-                        text = ["<font color=%s>%s</font>" % (color, a) for a in text]
+                        text = ['<font color=%s>%s</font>' % (color, a) for a in text]
                 elif ansi and n in self.highlight:
-                    text = ["\x1b[%d;1m%s\x1b[0m" % (ANSICOLOR[color], a) for a in text]
+                    text = ['\x1b[%d;1m%s\x1b[0m' % (ANSICOLOR[color], a) for a in text]
                 for x in range(maxnodeheight[row]):
                     # draw vertical lines in partially filled multiline node
                     # labels, but only if it's not a frontier node.
                     noderows[x][col] = (
                         text[x]
                         if x < len(text)
-                        else (vertline if childcols[n] else " ").center(
-                            maxnodewith[col], " "
+                        else (vertline if childcols[n] else ' ').center(
+                            maxnodewith[col], ' '
                         )
                     )
             # for each column, if there is a node below us which has a parent
@@ -481,16 +482,16 @@ class TreePrettyPrinter(object):
                             for noderow in noderows:
                                 noderow[col] = crosscell(noderow[col])
                 branchrow = [
-                    a + ((a[-1] if a[-1] != " " else b[0]) * nodedist)
-                    for a, b in zip(branchrow, branchrow[1:] + [" "])
+                    a + ((a[-1] if a[-1] != ' ' else b[0]) * nodedist)
+                    for a, b in zip(branchrow, branchrow[1:] + [' '])
                 ]
-                result.append("".join(branchrow))
+                result.append(''.join(branchrow))
             result.extend(
-                (" " * nodedist).join(noderow) for noderow in reversed(noderows)
+                (' ' * nodedist).join(noderow) for noderow in reversed(noderows)
             )
-        return "\n".join(reversed(result)) + "\n"
+        return '\n'.join(reversed(result)) + '\n'
 
-    def svg(self, nodecolor="blue", leafcolor="red", funccolor="green"):
+    def svg(self, nodecolor='blue', leafcolor='red', funccolor='green'):
         """
         :return: SVG representation of a tree.
         """
@@ -563,10 +564,10 @@ class TreePrettyPrinter(object):
             y = row * vscale + vstart
             if n in self.highlight:
                 color = nodecolor if isinstance(node, Tree) else leafcolor
-                if isinstance(node, Tree) and node.label().startswith("-"):
+                if isinstance(node, Tree) and node.label().startswith('-'):
                     color = funccolor
             else:
-                color = "black"
+                color = 'black'
             result += [
                 '\t<text style="text-anchor: middle; fill: %s; '
                 'font-size: %dpx;" x="%g" y="%g">%s</text>'
@@ -575,12 +576,12 @@ class TreePrettyPrinter(object):
                     fontsize,
                     x,
                     y,
-                    escape(node.label() if isinstance(node, Tree) else node, quote=False),
+                    escape(node.label() if isinstance(node, Tree) else node),
                 )
             ]
 
-        result += ["</svg>"]
-        return "\n".join(result)
+        result += ['</svg>']
+        return '\n'.join(result)
 
 
 def test():
@@ -588,7 +589,7 @@ def test():
 
     def print_tree(n, tree, sentence=None, ansi=True, **xargs):
         print()
-        print('{0}: "{1}"'.format(n, " ".join(sentence or tree.leaves())))
+        print('{0}: "{1}"'.format(n, ' '.join(sentence or tree.leaves())))
         print(tree)
         print()
         drawtree = TreePrettyPrinter(tree, sentence)
@@ -603,23 +604,23 @@ def test():
         tree = treebank.parsed_sents()[n]
         print_tree(n, tree, nodedist=2, maxwidth=8)
     print()
-    print("ASCII version:")
+    print('ASCII version:')
     print(TreePrettyPrinter(tree).text(nodedist=2))
 
     tree = Tree.fromstring(
-        "(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) "
-        "(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) "
-        "(vg 10) (inf (verb 11)))))) (punct 12))",
+        '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) '
+        '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) '
+        '(vg 10) (inf (verb 11)))))) (punct 12))',
         read_leaf=int,
     )
     sentence = (
-        "Ze had met haar moeder kunnen gaan winkelen ,"
-        " zwemmen of terrassen .".split()
+        'Ze had met haar moeder kunnen gaan winkelen ,'
+        ' zwemmen of terrassen .'.split()
     )
-    print_tree("Discontinuous tree", tree, sentence, nodedist=2)
+    print_tree('Discontinuous tree', tree, sentence, nodedist=2)
 
 
-__all__ = ["TreePrettyPrinter"]
+__all__ = ['TreePrettyPrinter']
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     test()
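
For illustration only (not part of this change), a minimal sketch of the TreePrettyPrinter API shown in this file; the tree is made up and the multi-line rendered output is omitted:

    from nltk.tree import Tree
    from nltk.treeprettyprinter import TreePrettyPrinter

    t = Tree.fromstring('(S (NP (NNP Mary)) (VP (VBZ walks)))')
    tpp = TreePrettyPrinter(t)
    print(tpp.text(unicodelines=True, nodedist=2))            # box-drawing rendering
    svg_markup = tpp.svg(nodecolor='blue', leafcolor='red')   # SVG markup as a string
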
index 0c422f6..c64ac70 100644 (file)
@@ -106,6 +106,7 @@ The following is a short tutorial on the available transformations.
      C   D      C   D
 
 """
+from __future__ import print_function
 
 from nltk.tree import Tree
 
@@ -331,7 +332,7 @@ def demo():
     draw_trees(t, collapsedTree, cnfTree, parentTree, original)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     demo()
 
 __all__ = ["chomsky_normal_form", "un_chomsky_normal_form", "collapse_unary"]
index 1666e2c..2d848e0 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Twitter
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
index 053ad6a..59d96cd 100644 (file)
Binary files a/nlp_resource_data/nltk/twitter/__pycache__/__init__.cpython-37.pyc and b/nlp_resource_data/nltk/twitter/__pycache__/__init__.cpython-37.pyc differ
index b098ccc..d7f7688 100644 (file)
Binary files a/nlp_resource_data/nltk/twitter/__pycache__/api.cpython-37.pyc and b/nlp_resource_data/nltk/twitter/__pycache__/api.cpython-37.pyc differ
index f04e698..9769110 100644 (file)
Binary files a/nlp_resource_data/nltk/twitter/__pycache__/common.cpython-37.pyc and b/nlp_resource_data/nltk/twitter/__pycache__/common.cpython-37.pyc differ
index ccbed61..be6afd1 100644 (file)
Binary files a/nlp_resource_data/nltk/twitter/__pycache__/twitter_demo.cpython-37.pyc and b/nlp_resource_data/nltk/twitter/__pycache__/twitter_demo.cpython-37.pyc differ
index 40849cc..a44a3ad 100644 (file)
Binary files a/nlp_resource_data/nltk/twitter/__pycache__/twitterclient.cpython-37.pyc and b/nlp_resource_data/nltk/twitter/__pycache__/twitterclient.cpython-37.pyc differ
index 8433890..91b8540 100644 (file)
Binary files a/nlp_resource_data/nltk/twitter/__pycache__/util.cpython-37.pyc and b/nlp_resource_data/nltk/twitter/__pycache__/util.cpython-37.pyc differ
index 1533ad2..2cce2b7 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Twitter API
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 #         Lorenzo Rubio <lrnzcig@gmail.com>
 # URL: <http://nltk.org/>
@@ -14,7 +14,11 @@ handling.
 
 import time as _time
 from abc import ABCMeta, abstractmethod
-from datetime import tzinfo, timedelta, timezone, datetime
+from datetime import tzinfo, timedelta, datetime
+
+from six import add_metaclass
+
+from nltk.compat import UTC
 
 
 class LocalTimezoneOffsetWithUTC(tzinfo):
@@ -47,7 +51,8 @@ class LocalTimezoneOffsetWithUTC(tzinfo):
 LOCAL = LocalTimezoneOffsetWithUTC()
 
 
-class BasicTweetHandler(metaclass=ABCMeta):
+@add_metaclass(ABCMeta)
+class BasicTweetHandler(object):
     """
     Minimal implementation of `TweetHandler`.
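
For illustration only (not part of this change), a sketch of the six.add_metaclass pattern that this hunk swaps in for the Python-3-only `metaclass=ABCMeta` keyword; the Handler/PrintHandler names are made up:

    from abc import ABCMeta, abstractmethod
    from six import add_metaclass

    @add_metaclass(ABCMeta)            # works on Python 2 and 3
    class Handler(object):
        @abstractmethod
        def handle(self, data):
            """Process a single item."""

    class PrintHandler(Handler):
        def handle(self, data):
            print(data)

    PrintHandler().handle('hello')     # instantiating Handler() itself would raise TypeError
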
 
@@ -124,9 +129,9 @@ class TweetHandlerI(BasicTweetHandler):
         Validate date limits.
         """
         if self.upper_date_limit or self.lower_date_limit:
-            date_fmt = "%a %b %d %H:%M:%S +0000 %Y"
-            tweet_date = datetime.strptime(data["created_at"], date_fmt).replace(
-                tzinfo=timezone.utc
+            date_fmt = '%a %b %d %H:%M:%S +0000 %Y'
+            tweet_date = datetime.strptime(data['created_at'], date_fmt).replace(
+                tzinfo=UTC
             )
             if (self.upper_date_limit and tweet_date > self.upper_date_limit) or (
                 self.lower_date_limit and tweet_date < self.lower_date_limit
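
For illustration only (not part of this change), a sketch of the date handling above with a hand-rolled UTC tzinfo standing in for nltk.compat.UTC; the timestamp string is a made-up example in Twitter's created_at format:

    from datetime import datetime, timedelta, tzinfo

    class _UTC(tzinfo):
        """Fixed zero-offset zone, standing in for nltk.compat.UTC."""
        def utcoffset(self, dt): return timedelta(0)
        def tzname(self, dt): return 'UTC'
        def dst(self, dt): return timedelta(0)

    date_fmt = '%a %b %d %H:%M:%S +0000 %Y'
    created_at = 'Wed Oct 10 20:19:24 +0000 2018'
    tweet_date = datetime.strptime(created_at, date_fmt).replace(tzinfo=_UTC())
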
index e4b3182..a06f08f 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Twitter client
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 #         Lorenzo Rubio <lrnzcig@gmail.com>
 # URL: <http://nltk.org/>
 Utility functions for the :module:`twitterclient` module which do not require
 the `twython` library to have been installed.
 """
+from __future__ import print_function
+
 import csv
 import gzip
 import json
 
-from nltk.internals import deprecated
+from nltk import compat
 
 HIER_SEPARATOR = "."
 
@@ -34,7 +36,7 @@ def extract_fields(tweet, fields):
             _add_field_to_out(tweet, field, out)
         except TypeError:
             raise RuntimeError(
-                "Fatal error when extracting fields. Cannot find field ", field
+                'Fatal error when extracting fields. Cannot find field ', field
             )
     return out
 
@@ -72,7 +74,7 @@ def _get_entity_recursive(json, entity):
             # structure that contain other Twitter objects. See:
             # https://dev.twitter.com/overview/api/entities-in-twitter-objects
 
-            if key == "entities" or key == "extended_entities":
+            if key == 'entities' or key == 'extended_entities':
                 candidate = _get_entity_recursive(value, entity)
                 if candidate is not None:
                     return candidate
@@ -88,7 +90,7 @@ def _get_entity_recursive(json, entity):
 
 
 def json2csv(
-    fp, outfile, fields, encoding="utf8", errors="replace", gzip_compress=False
+    fp, outfile, fields, encoding='utf8', errors='replace', gzip_compress=False
 ):
     """
     Extract selected fields from a file of line-separated JSON tweets and
@@ -113,7 +115,7 @@ def json2csv(
     are 'id_str' for the tweetID and 'text' for the text of the tweet. See\
     <https://dev.twitter.com/overview/api/tweets> for a full list of fields.\
     e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']\
-    Additionally, it allows IDs from other Twitter objects, e. g.,\
+    Additionally, it allows IDs from other Twitter objects, e. g.,\
     ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']
 
     :param error: Behaviour for encoding errors, see\
@@ -121,7 +123,7 @@ def json2csv(
 
     :param gzip_compress: if `True`, output files are compressed with gzip
     """
-    (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
+    (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
     # write the list of fields as header
     writer.writerow(fields)
     # process the file
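
For illustration only (not part of this change), a hedged sketch of calling json2csv with the field names mentioned in the docstring; 'tweets.json' and 'tweets.csv' are hypothetical file names:

    from nltk.twitter.util import json2csv

    with open('tweets.json') as fp:                    # line-delimited tweet objects
        json2csv(fp, 'tweets.csv',
                 ['id_str', 'text', 'favorite_count', 'retweet_count'])
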
@@ -132,18 +134,22 @@ def json2csv(
     outf.close()
 
 
-@deprecated("Use open() and csv.writer() directly instead.")
 def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
-    """Get a CSV writer with optional compression."""
-    return _outf_writer(outfile, encoding, errors, gzip_compress)
-
-
-def _outf_writer(outfile, encoding, errors, gzip_compress=False):
-    if gzip_compress:
-        outf = gzip.open(outfile, "wt", encoding=encoding, errors=errors)
+    """
+    Identify appropriate CSV writer given the Python version
+    """
+    if compat.PY3:
+        if gzip_compress:
+            outf = gzip.open(outfile, 'wt', encoding=encoding, errors=errors)
+        else:
+            outf = open(outfile, 'w', encoding=encoding, errors=errors)
+        writer = csv.writer(outf)
     else:
-        outf = open(outfile, "w", encoding=encoding, errors=errors)
-    writer = csv.writer(outf)
+        if gzip_compress:
+            outf = gzip.open(outfile, 'wb')
+        else:
+            outf = open(outfile, 'wb')
+        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
     return (writer, outf)
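
A similarly hedged sketch of the compat writer reinstated above, used directly; the output file name is hypothetical:

    from nltk.twitter.util import outf_writer_compat

    writer, outf = outf_writer_compat('fields.csv', encoding='utf8', errors='replace')
    writer.writerow(['id_str', 'text'])                # header row, as json2csv does
    outf.close()
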
 
 
@@ -153,8 +159,8 @@ def json2csv_entities(
     main_fields,
     entity_type,
     entity_fields,
-    encoding="utf8",
-    errors="replace",
+    encoding='utf8',
+    errors='replace',
     gzip_compress=False,
 ):
     """
@@ -197,7 +203,7 @@ def json2csv_entities(
     :param gzip_compress: if `True`, output files are compressed with gzip
     """
 
-    (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress)
+    (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
     header = get_header_field_list(main_fields, entity_type, entity_fields)
     writer.writerow(header)
     for line in tweets_file:
index a241c07..967728b 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Twitter client
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 #         Lorenzo Rubio <lrnzcig@gmail.com>
 # URL: <http://nltk.org/>
@@ -30,11 +30,13 @@ For documentation about the Twitter APIs, see `The Streaming APIs Overview
 For error codes see Twitter's
 `Error Codes and Responses <https://dev.twitter.com/overview/api/response-codes>`
 """
+from __future__ import print_function
 
 import datetime
 from functools import wraps
 import json
-from io import StringIO
+
+from nltk.compat import StringIO
 
 from nltk.twitter import (
     Query,
@@ -46,7 +48,7 @@ from nltk.twitter import (
 )
 
 
-SPACER = "###################################"
+SPACER = '###################################'
 
 
 def verbose(func):
@@ -79,10 +81,10 @@ def setup():
     """
     global USERIDS, FIELDS
 
-    USERIDS = ["759251", "612473", "15108702", "6017542", "2673523800"]
+    USERIDS = ['759251', '612473', '15108702', '6017542', '2673523800']
     # UserIDs corresponding to\
     #           @CNN,    @BBCNews, @ReutersLive, @BreakingNews, @AJELive
-    FIELDS = ["id_str"]
+    FIELDS = ['id_str']
 
 
 @verbose
@@ -92,18 +94,18 @@ def twitterclass_demo():
     """
     tw = Twitter()
     print("Track from the public stream\n")
-    tw.tweets(keywords="love, hate", limit=10)  # public stream
+    tw.tweets(keywords='love, hate', limit=10)  # public stream
     print(SPACER)
     print("Search past Tweets\n")
     tw = Twitter()
-    tw.tweets(keywords="love, hate", stream=False, limit=10)  # search past tweets
+    tw.tweets(keywords='love, hate', stream=False, limit=10)  # search past tweets
     print(SPACER)
     print(
         "Follow two accounts in the public stream"
         + " -- be prepared to wait a few minutes\n"
     )
     tw = Twitter()
-    tw.tweets(follow=["759251", "6017542"], stream=True, limit=5)  # public stream
+    tw.tweets(follow=['759251', '6017542'], stream=True, limit=5)  # public stream
 
 
 @verbose
@@ -129,18 +131,18 @@ def tracktoscreen_demo(track="taylor swift", limit=10):
 
 
 @verbose
-def search_demo(keywords="nltk"):
+def search_demo(keywords='nltk'):
     """
     Use the REST API to search for past tweets containing a given keyword.
     """
     oauth = credsfromfile()
     client = Query(**oauth)
     for tweet in client.search_tweets(keywords=keywords, limit=10):
-        print(tweet["text"])
+        print(tweet['text'])
 
 
 @verbose
-def tweets_by_user_demo(user="NLTK_org", count=200):
+def tweets_by_user_demo(user='NLTK_org', count=200):
     """
     Use the REST API to search for past tweets by a given user.
     """
@@ -159,9 +161,9 @@ def lookup_by_userid_demo():
     client = Query(**oauth)
     user_info = client.user_info_from_id(USERIDS)
     for info in user_info:
-        name = info["screen_name"]
-        followers = info["followers_count"]
-        following = info["friends_count"]
+        name = info['screen_name']
+        followers = info['followers_count']
+        following = info['friends_count']
         print("{0}, followers: {1}, following: {2}".format(name, followers, following))
 
 
@@ -209,7 +211,7 @@ def limit_by_time_demo(keywords="nltk"):
     print("Cutoff date: {}\n".format(dt_date))
 
     for tweet in client.search_tweets(keywords=keywords):
-        print("{} ".format(tweet["created_at"]), end="")
+        print("{} ".format(tweet['created_at']), end='')
         client.handler.handle(tweet)
 
 
@@ -269,12 +271,12 @@ def expand_tweetids_demo():
     hydrated = client.expand_tweetids(ids_f)
 
     for tweet in hydrated:
-        id_str = tweet["id_str"]
-        print("id: {}".format(id_str))
-        text = tweet["text"]
-        if text.startswith("@null"):
+        id_str = tweet['id_str']
+        print('id: {}'.format(id_str))
+        text = tweet['text']
+        if text.startswith('@null'):
             text = "[Tweet not available]"
-        print(text + "\n")
+        print(text + '\n')
 
 
 ALL = [
index a2af7af..9f79198 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Twitter client
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 #         Lorenzo Rubio <lrnzcig@gmail.com>
 # URL: <http://nltk.org/>
@@ -69,7 +69,7 @@ class Streamer(TwythonStreamer):
         """
         if self.do_continue:
             if self.handler is not None:
-                if "text" in data:
+                if 'text' in data:
                     self.handler.counter += 1
                     self.handler.handle(data)
                     self.do_continue = self.handler.do_continue()
@@ -104,7 +104,7 @@ class Streamer(TwythonStreamer):
                     print("Error (stream will continue): {0}".format(e))
                 continue
 
-    def filter(self, track="", follow="", lang="en"):
+    def filter(self, track='', follow='', lang='en'):
         """
         Wrapper for 'statuses / filter' API call
         """
@@ -112,7 +112,7 @@ class Streamer(TwythonStreamer):
             # Stream in an endless loop until limit is reached
 
             try:
-                if track == "" and follow == "":
+                if track == '' and follow == '':
                     msg = "Please supply a value for 'track', 'follow'"
                     raise ValueError(msg)
                 self.statuses.filter(track=track, follow=follow, lang=lang)
@@ -167,7 +167,7 @@ class Query(Twython):
 
         return itertools.chain.from_iterable(chunked_tweets)
 
-    def _search_tweets(self, keywords, limit=100, lang="en"):
+    def _search_tweets(self, keywords, limit=100, lang='en'):
         """
         Assumes that the handler has been informed. Fetches Tweets from
         search_tweets generator output and passes them to handler
@@ -191,7 +191,7 @@ class Query(Twython):
         self,
         keywords,
         limit=100,
-        lang="en",
+        lang='en',
         max_id=None,
         retries_after_twython_exception=0,
     ):
@@ -199,7 +199,7 @@ class Query(Twython):
         Call the REST API ``'search/tweets'`` endpoint with some plausible
         defaults. See `the Twitter search documentation
         <https://dev.twitter.com/rest/public/search>`_ for more information
-        about admissible search parameters.
+        about admissible search parameters.
 
         :param str keywords: A list of query terms to search for, written as\
         a comma-separated string
@@ -220,16 +220,16 @@ class Query(Twython):
             self.handler.max_id = max_id
         else:
             results = self.search(
-                q=keywords, count=min(100, limit), lang=lang, result_type="recent"
+                q=keywords, count=min(100, limit), lang=lang, result_type='recent'
             )
-            count = len(results["statuses"])
+            count = len(results['statuses'])
             if count == 0:
                 print("No Tweets available through REST API for those keywords")
                 return
             count_from_query = count
-            self.handler.max_id = results["statuses"][count - 1]["id"] - 1
+            self.handler.max_id = results['statuses'][count - 1]['id'] - 1
 
-            for result in results["statuses"]:
+            for result in results['statuses']:
                 yield result
                 self.handler.counter += 1
                 if self.handler.do_continue() == False:
@@ -246,7 +246,7 @@ class Query(Twython):
                     count=mcount,
                     lang=lang,
                     max_id=self.handler.max_id,
-                    result_type="recent",
+                    result_type='recent',
                 )
             except TwythonRateLimitError as e:
                 print("Waiting for 15 minutes -{0}".format(e))
@@ -258,7 +258,7 @@ class Query(Twython):
                     raise e
                 retries += 1
 
-            count = len(results["statuses"])
+            count = len(results['statuses'])
             if count == 0:
                 print("No more Tweets available through rest api")
                 return
@@ -267,9 +267,9 @@ class Query(Twython):
             # results['search_metadata']['next_results'], but as part of a
             # query and difficult to fetch. This is doing the equivalent
             # (last tweet id minus one)
-            self.handler.max_id = results["statuses"][count - 1]["id"] - 1
+            self.handler.max_id = results['statuses'][count - 1]['id'] - 1
 
-            for result in results["statuses"]:
+            for result in results['statuses']:
                 yield result
                 self.handler.counter += 1
                 if self.handler.do_continue() == False:
@@ -286,7 +286,7 @@ class Query(Twython):
         """
         return [self.show_user(user_id=userid) for userid in userids]
 
-    def user_tweets(self, screen_name, limit, include_rts="false"):
+    def user_tweets(self, screen_name, limit, include_rts='false'):
         """
         Return a collection of the most recent Tweets posted by the user
 
@@ -315,13 +315,13 @@ class Twitter(object):
 
     def tweets(
         self,
-        keywords="",
-        follow="",
+        keywords='',
+        follow='',
         to_screen=True,
         stream=True,
         limit=100,
         date_limit=None,
-        lang="en",
+        lang='en',
         repeat=False,
         gzip_compress=False,
     ):
@@ -398,13 +398,13 @@ class Twitter(object):
 
         if stream:
             self.streamer.register(handler)
-            if keywords == "" and follow == "":
+            if keywords == '' and follow == '':
                 self.streamer.sample()
             else:
                 self.streamer.filter(track=keywords, follow=follow, lang=lang)
         else:
             self.query.register(handler)
-            if keywords == "":
+            if keywords == '':
                 raise ValueError("Please supply at least one keyword to search for.")
             else:
                 self.query._search_tweets(keywords, limit=limit, lang=lang)
@@ -423,7 +423,7 @@ class TweetViewer(TweetHandlerI):
         :rtype: bool
         :param data: Tweet object returned by Twitter API
         """
-        text = data["text"]
+        text = data['text']
         print(text)
 
         self.check_date_limit(data)
@@ -431,7 +431,7 @@ class TweetViewer(TweetHandlerI):
             return
 
     def on_finish(self):
-        print("Written {0} Tweets".format(self.counter))
+        print('Written {0} Tweets'.format(self.counter))
 
 
 class TweetWriter(TweetHandlerI):
@@ -444,8 +444,8 @@ class TweetWriter(TweetHandlerI):
         limit=2000,
         upper_date_limit=None,
         lower_date_limit=None,
-        fprefix="tweets",
-        subdir="twitter-files",
+        fprefix='tweets',
+        subdir='twitter-files',
         repeat=False,
         gzip_compress=False,
     ):
@@ -497,13 +497,13 @@ class TweetWriter(TweetHandlerI):
                 os.mkdir(subdir)
 
         fname = os.path.join(subdir, fprefix)
-        fmt = "%Y%m%d-%H%M%S"
+        fmt = '%Y%m%d-%H%M%S'
         timestamp = datetime.datetime.now().strftime(fmt)
         if self.gzip_compress:
-            suffix = ".gz"
+            suffix = '.gz'
         else:
-            suffix = ""
-        outfile = "{0}.{1}.json{2}".format(fname, timestamp, suffix)
+            suffix = ''
+        outfile = '{0}.{1}.json{2}'.format(fname, timestamp, suffix)
         return outfile
 
     def handle(self, data):
@@ -515,14 +515,14 @@ class TweetWriter(TweetHandlerI):
         """
         if self.startingup:
             if self.gzip_compress:
-                self.output = gzip.open(self.fname, "w")
+                self.output = gzip.open(self.fname, 'w')
             else:
-                self.output = open(self.fname, "w")
-            print("Writing to {0}".format(self.fname))
+                self.output = open(self.fname, 'w')
+            print('Writing to {0}'.format(self.fname))
 
         json_data = json.dumps(data)
         if self.gzip_compress:
-            self.output.write((json_data + "\n").encode("utf-8"))
+            self.output.write((json_data + "\n").encode('utf-8'))
         else:
             self.output.write(json_data + "\n")
 
@@ -533,7 +533,7 @@ class TweetWriter(TweetHandlerI):
         self.startingup = False
 
     def on_finish(self):
-        print("Written {0} Tweets".format(self.counter))
+        print('Written {0} Tweets'.format(self.counter))
         if self.output:
             self.output.close()
 
index 1d859f9..888ed75 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Natural Language Toolkit: Twitter client
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Ewan Klein <ewan@inf.ed.ac.uk>
 #         Lorenzo Rubio <lrnzcig@gmail.com>
 # URL: <http://nltk.org/>
@@ -11,6 +11,8 @@
 Authentication utilities to accompany :module:`twitterclient`.
 """
 
+from __future__ import print_function
+
 import os
 import pprint
 from twython import Twython
@@ -31,12 +33,12 @@ class Authenticate(object):
     """
 
     def __init__(self):
-        self.creds_file = "credentials.txt"
+        self.creds_file = 'credentials.txt'
         self.creds_fullpath = None
 
         self.oauth = {}
         try:
-            self.twitter_dir = os.environ["TWITTER"]
+            self.twitter_dir = os.environ['TWITTER']
             self.creds_subdir = self.twitter_dir
         except KeyError:
             self.twitter_dir = None
@@ -84,15 +86,15 @@ class Authenticate(object):
         )
 
         if not os.path.isfile(self.creds_fullpath):
-            raise OSError("Cannot find file {}".format(self.creds_fullpath))
+            raise OSError('Cannot find file {}'.format(self.creds_fullpath))
 
         with open(self.creds_fullpath) as infile:
             if verbose:
-                print("Reading credentials file {}".format(self.creds_fullpath))
+                print('Reading credentials file {}'.format(self.creds_fullpath))
 
             for line in infile:
-                if "=" in line:
-                    name, value = line.split("=", 1)
+                if '=' in line:
+                    name, value = line.split('=', 1)
                     self.oauth[name.strip()] = value.strip()
 
         self._validate_creds_file(verbose=verbose)
@@ -102,16 +104,16 @@ class Authenticate(object):
     def _validate_creds_file(self, verbose=False):
         """Check validity of a credentials file."""
         oauth1 = False
-        oauth1_keys = ["app_key", "app_secret", "oauth_token", "oauth_token_secret"]
+        oauth1_keys = ['app_key', 'app_secret', 'oauth_token', 'oauth_token_secret']
         oauth2 = False
-        oauth2_keys = ["app_key", "app_secret", "access_token"]
+        oauth2_keys = ['app_key', 'app_secret', 'access_token']
         if all(k in self.oauth for k in oauth1_keys):
             oauth1 = True
         elif all(k in self.oauth for k in oauth2_keys):
             oauth2 = True
 
         if not (oauth1 or oauth2):
-            msg = "Missing or incorrect entries in {}\n".format(self.creds_file)
+            msg = 'Missing or incorrect entries in {}\n'.format(self.creds_file)
             msg += pprint.pformat(self.oauth)
             raise ValueError(msg)
         elif verbose:
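
For illustration only (not part of this change), a hypothetical credentials.txt in the name=value form the parser above expects; the key names follow oauth1_keys and every value is a placeholder:

    app_key=YOUR_APP_KEY
    app_secret=YOUR_APP_SECRET
    oauth_token=YOUR_OAUTH_TOKEN
    oauth_token_secret=YOUR_OAUTH_TOKEN_SECRET
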
@@ -125,15 +127,15 @@ def add_access_token(creds_file=None):
     """
     if creds_file is None:
         path = os.path.dirname(__file__)
-        creds_file = os.path.join(path, "credentials2.txt")
+        creds_file = os.path.join(path, 'credentials2.txt')
     oauth2 = credsfromfile(creds_file=creds_file)
-    app_key = oauth2["app_key"]
-    app_secret = oauth2["app_secret"]
+    app_key = oauth2['app_key']
+    app_secret = oauth2['app_secret']
 
     twitter = Twython(app_key, app_secret, oauth_version=2)
     access_token = twitter.obtain_access_token()
-    tok = "access_token={}\n".format(access_token)
-    with open(creds_file, "a") as infile:
+    tok = 'access_token={}\n'.format(access_token)
+    with open(creds_file, 'a') as infile:
         print(tok, file=infile)
 
 
index baff54e..b4c5b00 100644 (file)
@@ -1,9 +1,10 @@
 # Natural Language Toolkit: Utility functions
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # Author: Steven Bird <stevenbird1@gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+from __future__ import print_function
 
 import sys
 import inspect
@@ -15,12 +16,13 @@ import pydoc
 import bisect
 import os
 
-from itertools import islice, chain, combinations, tee
+from itertools import islice, chain, combinations
 from pprint import pprint
 from collections import defaultdict, deque
 from sys import version_info
 
-from urllib.request import (
+from six import class_types, string_types, text_type
+from six.moves.urllib.request import (
     build_opener,
     install_opener,
     getproxies,
@@ -32,6 +34,7 @@ from urllib.request import (
 
 from nltk.internals import slice_bounds, raise_unorderable_types
 from nltk.collections import *
+from nltk.compat import python_2_unicode_compatible
 
 
 ######################################################################
@@ -39,34 +42,37 @@ from nltk.collections import *
 ######################################################################
 
 
-def usage(obj, selfname="self"):
+def usage(obj, selfname='self'):
     str(obj)  # In case it's lazy, this will load it.
 
-    if not isinstance(obj, type):
+    if not isinstance(obj, class_types):
         obj = obj.__class__
 
-    print("%s supports the following operations:" % obj.__name__)
+    print('%s supports the following operations:' % obj.__name__)
     for (name, method) in sorted(pydoc.allmethods(obj).items()):
-        if name.startswith("_"):
+        if name.startswith('_'):
             continue
-        if getattr(method, "__deprecated__", False):
+        if getattr(method, '__deprecated__', False):
             continue
 
-        getargspec = inspect.getfullargspec
+        if sys.version_info[0] >= 3:
+            getargspec = inspect.getfullargspec
+        else:
+            getargspec = inspect.getargspec
         args, varargs, varkw, defaults = getargspec(method)[:4]
         if (
             args
-            and args[0] == "self"
+            and args[0] == 'self'
             and (defaults is None or len(args) > len(defaults))
         ):
             args = args[1:]
-            name = "%s.%s" % (selfname, name)
+            name = '%s.%s' % (selfname, name)
         argspec = inspect.formatargspec(args, varargs, varkw, defaults)
         print(
             textwrap.fill(
-                "%s%s" % (name, argspec),
-                initial_indent="  - ",
-                subsequent_indent=" " * (len(name) + 5),
+                '%s%s' % (name, argspec),
+                initial_indent='  - ',
+                subsequent_indent=' ' * (len(name) + 5),
             )
         )
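
For illustration only (not part of this change), a sketch of calling nltk.util.usage(), whose argspec-introspection branch is what this hunk changes; FreqDist is just a convenient class to inspect:

    from nltk.util import usage
    from nltk.probability import FreqDist

    # Prints "FreqDist supports the following operations:" followed by each
    # public method and its signature, formatted as in the code above.
    usage(FreqDist)
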
 
@@ -89,7 +95,7 @@ def in_idle():
     """
     import sys
 
-    return sys.stdin.__class__.__name__ in ("PyShell", "RPCProxy")
+    return sys.stdin.__class__.__name__ in ('PyShell', 'RPCProxy')
 
 
 ##########################################################################
@@ -120,7 +126,7 @@ def print_string(s, width=70):
     :param width: the display width
     :type width: int
     """
-    print("\n".join(textwrap.wrap(s, width=width)))
+    print('\n'.join(textwrap.wrap(s, width=width)))
 
 
 def tokenwrap(tokens, separator=" ", width=70):
@@ -134,7 +140,7 @@ def tokenwrap(tokens, separator=" ", width=70):
     :param width: the display width (default=70)
     :type width: int
     """
-    return "\n".join(textwrap.wrap(separator.join(tokens), width=width))
+    return '\n'.join(textwrap.wrap(separator.join(tokens), width=width))
 
 
 ##########################################################################
@@ -196,10 +202,10 @@ def re_show(regexp, string, left="{", right="}"):
 
 # recipe from David Mertz
 def filestring(f):
-    if hasattr(f, "read"):
+    if hasattr(f, 'read'):
         return f.read()
-    elif isinstance(f, str):
-        with open(f, "r") as infile:
+    elif isinstance(f, string_types):
+        with open(f, 'r') as infile:
             return infile.read()
     else:
         raise ValueError("Must be called with a filename or file-like object")
@@ -253,7 +259,7 @@ def guess_encoding(data):
     """
     successful_encoding = None
     # we make 'utf-8' the first encoding
-    encodings = ["utf-8"]
+    encodings = ['utf-8']
     #
     # next we add anything we can learn from the locale
     try:
@@ -270,14 +276,14 @@ def guess_encoding(data):
         pass
     #
     # we try 'latin-1' last
-    encodings.append("latin-1")
+    encodings.append('latin-1')
     for enc in encodings:
         # some of the locale calls
         # may have returned None
         if not enc:
             continue
         try:
-            decoded = str(data, enc)
+            decoded = text_type(data, enc)
             successful_encoding = enc
 
         except (UnicodeError, LookupError):
@@ -286,9 +292,9 @@ def guess_encoding(data):
             break
     if not successful_encoding:
         raise UnicodeError(
-            "Unable to decode input data. "
-            "Tried the following encodings: %s."
-            % ", ".join([repr(enc) for enc in encodings if enc])
+            'Unable to decode input data. '
+            'Tried the following encodings: %s.'
+            % ', '.join([repr(enc) for enc in encodings if enc])
         )
     else:
         return (decoded, successful_encoding)
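
For illustration only (not part of this change), a sketch of guess_encoding(); the returned encoding name depends on the locale, so no exact output is asserted:

    from nltk.util import guess_encoding

    raw = u'café'.encode('latin-1')              # bytes that are not valid UTF-8
    decoded, encoding_used = guess_encoding(raw) # tries utf-8, locale codecs, then latin-1
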
@@ -313,7 +319,7 @@ def unique_list(xs):
 def invert_dict(d):
     inverted_dict = defaultdict(list)
     for key in d:
-        if hasattr(d[key], "__iter__"):
+        if hasattr(d[key], '__iter__'):
             for term in d[key]:
                 inverted_dict[term].append(key)
         else:
@@ -622,7 +628,7 @@ def skipgrams(sequence, n, k, **kwargs):
     """
 
     # Pads the sequence as desired by **kwargs.
-    if "pad_left" in kwargs or "pad_right" in kwargs:
+    if 'pad_left' in kwargs or 'pad_right' in kwargs:
         sequence = pad_sequence(sequence, n, **kwargs)
 
     # Note when iterating through the ngrams, the pad_right here is not
@@ -654,12 +660,12 @@ def binary_search_file(file, key, cache={}, cacheDepth=-1):
     :param key: the identifier we are searching for.
     """
 
-    key = key + " "
+    key = key + ' '
     keylen = len(key)
     start = 0
     currentDepth = 0
 
-    if hasattr(file, "name"):
+    if hasattr(file, 'name'):
         end = os.stat(file.name).st_size - 1
     else:
         file.seek(0, 2)
@@ -717,7 +723,7 @@ def binary_search_file(file, key, cache={}, cacheDepth=-1):
 ######################################################################
 
 
-def set_proxy(proxy, user=None, password=""):
+def set_proxy(proxy, user=None, password=''):
     """
     Set the HTTP proxy for Python to download through.
 
@@ -730,15 +736,17 @@ def set_proxy(proxy, user=None, password=""):
         authentication.
     :param password: The password to authenticate with.
     """
+    from nltk import compat
+
     if proxy is None:
         # Try and find the system proxy settings
         try:
-            proxy = getproxies()["http"]
+            proxy = getproxies()['http']
         except KeyError:
-            raise ValueError("Could not detect default proxy settings")
+            raise ValueError('Could not detect default proxy settings')
 
     # Set up the proxy handler
-    proxy_handler = ProxyHandler({"https": proxy, "http": proxy})
+    proxy_handler = ProxyHandler({'https': proxy, 'http': proxy})
     opener = build_opener(proxy_handler)
 
     if user is not None:
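
For illustration only (not part of this change), a sketch of set_proxy() with a hypothetical proxy host and credentials:

    from nltk.util import set_proxy

    set_proxy('http://proxy.example.com:3128', user='jdoe', password='s3cret')
    # Later urllib-based fetches (e.g. the NLTK downloader) now go through the proxy.
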
@@ -817,29 +825,3 @@ def choose(n, k):
         return ntok // ktok
     else:
         return 0
-
-
-######################################################################
-# Iteration utilities
-######################################################################
-
-
-def pairwise(iterable):
-    """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
-    a, b = tee(iterable)
-    next(b, None)
-    return zip(a, b)
-
-######################################################################
-# Parallization.
-######################################################################
-
-
-def parallelize_preprocess(func, iterator, processes, progress_bar=False):
-    from tqdm import tqdm
-    from joblib import Parallel, delayed
-
-    iterator = tqdm(iterator) if progress_bar else iterator
-    if processes <= 1:
-        return map(func, iterator)
-    return Parallel(n_jobs=processes)(delayed(func)(line) for line in iterator)
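
For reference (not part of this change), the pairwise() helper removed above is the standard itertools adjacent-pairs idiom; a hedged sketch of it in isolation:

    from itertools import tee

    def pairwise(iterable):
        "s -> (s0,s1), (s1,s2), (s2,s3), ..."
        a, b = tee(iterable)
        next(b, None)
        return zip(a, b)

    list(pairwise([1, 2, 3, 4]))   # [(1, 2), (2, 3), (3, 4)]
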
index ed9599c..611f649 100644 (file)
@@ -3,7 +3,7 @@
 # Authors: Liling Tan <alvations@gmail.com>,
 #          Dmitrijs Milajevs <dimazest@gmail.com>
 #
-# Copyright (C) 2001-2020 NLTK Project
+# Copyright (C) 2001-2019 NLTK Project
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT